Bleking committed on Dec 25, 2024

Commit

449959a

1 Parent(s): 5703d15

Upload llava-v1.6-vicuna-7b checkpoint

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

llava-v1.6-vicuna-7b/README.md +202 -0
llava-v1.6-vicuna-7b/adapter_config.json +34 -0
llava-v1.6-vicuna-7b/adapter_model.safetensors +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/README.md +202 -0
llava-v1.6-vicuna-7b/checkpoint-250/adapter_config.json +34 -0
llava-v1.6-vicuna-7b/checkpoint-250/adapter_model.safetensors +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/latest +1 -0
llava-v1.6-vicuna-7b/checkpoint-250/rng_state_0.pth +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/rng_state_1.pth +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/rng_state_2.pth +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/rng_state_3.pth +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/special_tokens_map.json +24 -0
llava-v1.6-vicuna-7b/checkpoint-250/tokenizer.model +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/tokenizer_config.json +43 -0
llava-v1.6-vicuna-7b/checkpoint-250/trainer_state.json +3783 -0
llava-v1.6-vicuna-7b/checkpoint-250/training_args.bin +3 -0
llava-v1.6-vicuna-7b/checkpoint-250/zero_to_fp32.py +604 -0
llava-v1.6-vicuna-7b/checkpoint-320/README.md +202 -0
llava-v1.6-vicuna-7b/checkpoint-320/adapter_config.json +34 -0
llava-v1.6-vicuna-7b/checkpoint-320/adapter_model.safetensors +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/latest +1 -0
llava-v1.6-vicuna-7b/checkpoint-320/rng_state_0.pth +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/rng_state_1.pth +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/rng_state_2.pth +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/rng_state_3.pth +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/special_tokens_map.json +24 -0
llava-v1.6-vicuna-7b/checkpoint-320/tokenizer.model +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/tokenizer_config.json +43 -0
llava-v1.6-vicuna-7b/checkpoint-320/trainer_state.json +0 -0
llava-v1.6-vicuna-7b/checkpoint-320/training_args.bin +3 -0
llava-v1.6-vicuna-7b/checkpoint-320/zero_to_fp32.py +604 -0
llava-v1.6-vicuna-7b/config.json +76 -0
llava-v1.6-vicuna-7b/non_lora_trainables.bin +3 -0
llava-v1.6-vicuna-7b/optimizer.pt +3 -0

llava-v1.6-vicuna-7b/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: liuhaotian/llava-v1.6-vicuna-7b
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

llava-v1.6-vicuna-7b/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "liuhaotian/llava-v1.6-vicuna-7b",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "gate_proj",
+    "k_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

llava-v1.6-vicuna-7b/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:62efdd9d84e66cfc4095550fcdaa5b6224f73e2bd94f6ed848b1546da576e22d
+size 42421336

llava-v1.6-vicuna-7b/checkpoint-250/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: liuhaotian/llava-v1.6-vicuna-7b
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

llava-v1.6-vicuna-7b/checkpoint-250/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "liuhaotian/llava-v1.6-vicuna-7b",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

llava-v1.6-vicuna-7b/checkpoint-250/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:62efdd9d84e66cfc4095550fcdaa5b6224f73e2bd94f6ed848b1546da576e22d
+size 42421336

llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53103e06a5d611618fdd7d4499d9a429c91d6d771bc563b84c01f326e11aaba6
+size 663858

llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:677120a99891df3a884e1a8f8988c4ed0e7a41c909cb08eab86af46f13949114
+size 126447597

llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff52b819ca639c237933403220e55638b75cd7b423042db2aba6ca657a4f65cf
+size 663858

llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a929c1a8a86028614c89037068984fd08221959bed2223579d48b1a6f217ff4f
+size 126447597

llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:11fd3eadb7c27f2cb6679ccc1eaefe767dc262590784f2cfa3cd87f4aacd36d3
+size 663858

llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a21d34bcdb10e7befcffe4fe5018d6c8fbf617f9279feb97ca5e3dba698b8577
+size 126447597

llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a37b14ef13d5890f298d26a399f2d3e79b2731d2c024051e064f284fc3f96e4e
+size 663858

llava-v1.6-vicuna-7b/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f484fda7e2347ae9431e40c3fc3be548c6671d6958f0d943d96bcb4d23ce39a0
+size 126447597

llava-v1.6-vicuna-7b/checkpoint-250/latest ADDED Viewed

	@@ -0,0 +1 @@


+global_step250

llava-v1.6-vicuna-7b/checkpoint-250/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:593bd3951c021cbb7bc83f0c6bddb231b6a5e79c52497c8f43126c62acb2d702
+size 14960

llava-v1.6-vicuna-7b/checkpoint-250/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d241471d3a91780ba4a9fb7ebb7fd7ee2dba99fc21ec351ce86a0ee75b95fef
+size 14960

llava-v1.6-vicuna-7b/checkpoint-250/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3933f6586cc586a8dc1592325aeffe4b89c021f6a722b0dfb9bfed65dd1e1018
+size 14960

llava-v1.6-vicuna-7b/checkpoint-250/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75c1e54d38ed160c3abfe7698770cedbec73a956b3fe6708d564ffc927944fab
+size 14960

llava-v1.6-vicuna-7b/checkpoint-250/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

llava-v1.6-vicuna-7b/checkpoint-250/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

llava-v1.6-vicuna-7b/checkpoint-250/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 2048,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

llava-v1.6-vicuna-7b/checkpoint-250/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3783 @@

+{
+  "best_metric": 0.6768932938575745,
+  "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-7b/checkpoint-250",
+  "epoch": 7.8125,
+  "eval_steps": 1.0,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.03125,
+      "grad_norm": 1.0817695604199613,
+      "learning_rate": 0.0,
+      "loss": 1.3872,
+      "step": 1
+    },
+    {
+      "epoch": 0.03125,
+      "eval_loss": 1.4023343324661255,
+      "eval_runtime": 35.2562,
+      "eval_samples_per_second": 5.673,
+      "eval_steps_per_second": 0.369,
+      "step": 1
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.8573794343563677,
+      "learning_rate": 8.613531161467863e-06,
+      "loss": 1.3352,
+      "step": 2
+    },
+    {
+      "epoch": 0.0625,
+      "eval_loss": 1.4023343324661255,
+      "eval_runtime": 27.8829,
+      "eval_samples_per_second": 7.173,
+      "eval_steps_per_second": 0.466,
+      "step": 2
+    },
+    {
+      "epoch": 0.09375,
+      "grad_norm": 0.8545279010393898,
+      "learning_rate": 1.3652123889719709e-05,
+      "loss": 1.3838,
+      "step": 3
+    },
+    {
+      "epoch": 0.09375,
+      "eval_loss": 1.3825562000274658,
+      "eval_runtime": 27.9018,
+      "eval_samples_per_second": 7.168,
+      "eval_steps_per_second": 0.466,
+      "step": 3
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.7747695318679186,
+      "learning_rate": 1.7227062322935725e-05,
+      "loss": 1.3442,
+      "step": 4
+    },
+    {
+      "epoch": 0.125,
+      "eval_loss": 1.3529690504074097,
+      "eval_runtime": 27.9234,
+      "eval_samples_per_second": 7.162,
+      "eval_steps_per_second": 0.466,
+      "step": 4
+    },
+    {
+      "epoch": 0.15625,
+      "grad_norm": 0.9223438945487747,
+      "learning_rate": 2e-05,
+      "loss": 1.3265,
+      "step": 5
+    },
+    {
+      "epoch": 0.15625,
+      "eval_loss": 1.3111159801483154,
+      "eval_runtime": 27.8183,
+      "eval_samples_per_second": 7.19,
+      "eval_steps_per_second": 0.467,
+      "step": 5
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.8553066709777654,
+      "learning_rate": 2e-05,
+      "loss": 1.2969,
+      "step": 6
+    },
+    {
+      "epoch": 0.1875,
+      "eval_loss": 1.267953634262085,
+      "eval_runtime": 28.5087,
+      "eval_samples_per_second": 7.015,
+      "eval_steps_per_second": 0.456,
+      "step": 6
+    },
+    {
+      "epoch": 0.21875,
+      "grad_norm": 0.7513319744508511,
+      "learning_rate": 2e-05,
+      "loss": 1.2643,
+      "step": 7
+    },
+    {
+      "epoch": 0.21875,
+      "eval_loss": 1.2324440479278564,
+      "eval_runtime": 28.7026,
+      "eval_samples_per_second": 6.968,
+      "eval_steps_per_second": 0.453,
+      "step": 7
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5926161530676572,
+      "learning_rate": 2e-05,
+      "loss": 1.2343,
+      "step": 8
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 1.2082672119140625,
+      "eval_runtime": 28.709,
+      "eval_samples_per_second": 6.966,
+      "eval_steps_per_second": 0.453,
+      "step": 8
+    },
+    {
+      "epoch": 0.28125,
+      "grad_norm": 0.45585108261607465,
+      "learning_rate": 2e-05,
+      "loss": 1.2556,
+      "step": 9
+    },
+    {
+      "epoch": 0.28125,
+      "eval_loss": 1.1897780895233154,
+      "eval_runtime": 28.5026,
+      "eval_samples_per_second": 7.017,
+      "eval_steps_per_second": 0.456,
+      "step": 9
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 0.45306175711380503,
+      "learning_rate": 2e-05,
+      "loss": 1.1941,
+      "step": 10
+    },
+    {
+      "epoch": 0.3125,
+      "eval_loss": 1.1719207763671875,
+      "eval_runtime": 28.4252,
+      "eval_samples_per_second": 7.036,
+      "eval_steps_per_second": 0.457,
+      "step": 10
+    },
+    {
+      "epoch": 0.34375,
+      "grad_norm": 0.40702053502599356,
+      "learning_rate": 2e-05,
+      "loss": 1.2414,
+      "step": 11
+    },
+    {
+      "epoch": 0.34375,
+      "eval_loss": 1.1534627676010132,
+      "eval_runtime": 31.953,
+      "eval_samples_per_second": 6.259,
+      "eval_steps_per_second": 0.407,
+      "step": 11
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.45771435281195333,
+      "learning_rate": 2e-05,
+      "loss": 1.202,
+      "step": 12
+    },
+    {
+      "epoch": 0.375,
+      "eval_loss": 1.1343497037887573,
+      "eval_runtime": 31.7064,
+      "eval_samples_per_second": 6.308,
+      "eval_steps_per_second": 0.41,
+      "step": 12
+    },
+    {
+      "epoch": 0.40625,
+      "grad_norm": 0.49237132802399297,
+      "learning_rate": 2e-05,
+      "loss": 1.2167,
+      "step": 13
+    },
+    {
+      "epoch": 0.40625,
+      "eval_loss": 1.1149284839630127,
+      "eval_runtime": 31.7514,
+      "eval_samples_per_second": 6.299,
+      "eval_steps_per_second": 0.409,
+      "step": 13
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.4707558788321445,
+      "learning_rate": 2e-05,
+      "loss": 1.0463,
+      "step": 14
+    },
+    {
+      "epoch": 0.4375,
+      "eval_loss": 1.0956928730010986,
+      "eval_runtime": 30.7821,
+      "eval_samples_per_second": 6.497,
+      "eval_steps_per_second": 0.422,
+      "step": 14
+    },
+    {
+      "epoch": 0.46875,
+      "grad_norm": 0.44161060970171445,
+      "learning_rate": 2e-05,
+      "loss": 1.1615,
+      "step": 15
+    },
+    {
+      "epoch": 0.46875,
+      "eval_loss": 1.0776234865188599,
+      "eval_runtime": 30.5336,
+      "eval_samples_per_second": 6.55,
+      "eval_steps_per_second": 0.426,
+      "step": 15
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.43310242386256154,
+      "learning_rate": 2e-05,
+      "loss": 1.0941,
+      "step": 16
+    },
+    {
+      "epoch": 0.5,
+      "eval_loss": 1.061128854751587,
+      "eval_runtime": 33.8247,
+      "eval_samples_per_second": 5.913,
+      "eval_steps_per_second": 0.384,
+      "step": 16
+    },
+    {
+      "epoch": 0.53125,
+      "grad_norm": 0.3719623439057395,
+      "learning_rate": 2e-05,
+      "loss": 1.0992,
+      "step": 17
+    },
+    {
+      "epoch": 0.53125,
+      "eval_loss": 1.0465847253799438,
+      "eval_runtime": 32.7443,
+      "eval_samples_per_second": 6.108,
+      "eval_steps_per_second": 0.397,
+      "step": 17
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.42266460981580545,
+      "learning_rate": 2e-05,
+      "loss": 1.0904,
+      "step": 18
+    },
+    {
+      "epoch": 0.5625,
+      "eval_loss": 1.0327677726745605,
+      "eval_runtime": 32.5697,
+      "eval_samples_per_second": 6.141,
+      "eval_steps_per_second": 0.399,
+      "step": 18
+    },
+    {
+      "epoch": 0.59375,
+      "grad_norm": 0.35416098431161336,
+      "learning_rate": 2e-05,
+      "loss": 1.0055,
+      "step": 19
+    },
+    {
+      "epoch": 0.59375,
+      "eval_loss": 1.019870638847351,
+      "eval_runtime": 32.6927,
+      "eval_samples_per_second": 6.118,
+      "eval_steps_per_second": 0.398,
+      "step": 19
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.3454390449296124,
+      "learning_rate": 2e-05,
+      "loss": 1.1291,
+      "step": 20
+    },
+    {
+      "epoch": 0.625,
+      "eval_loss": 1.008323311805725,
+      "eval_runtime": 32.5051,
+      "eval_samples_per_second": 6.153,
+      "eval_steps_per_second": 0.4,
+      "step": 20
+    },
+    {
+      "epoch": 0.65625,
+      "grad_norm": 0.291766075949861,
+      "learning_rate": 2e-05,
+      "loss": 1.0363,
+      "step": 21
+    },
+    {
+      "epoch": 0.65625,
+      "eval_loss": 0.9983346462249756,
+      "eval_runtime": 36.1543,
+      "eval_samples_per_second": 5.532,
+      "eval_steps_per_second": 0.36,
+      "step": 21
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 0.3071914269593122,
+      "learning_rate": 2e-05,
+      "loss": 1.0869,
+      "step": 22
+    },
+    {
+      "epoch": 0.6875,
+      "eval_loss": 0.989651083946228,
+      "eval_runtime": 35.9583,
+      "eval_samples_per_second": 5.562,
+      "eval_steps_per_second": 0.362,
+      "step": 22
+    },
+    {
+      "epoch": 0.71875,
+      "grad_norm": 0.2642686659789585,
+      "learning_rate": 2e-05,
+      "loss": 1.0706,
+      "step": 23
+    },
+    {
+      "epoch": 0.71875,
+      "eval_loss": 0.981977641582489,
+      "eval_runtime": 35.7624,
+      "eval_samples_per_second": 5.592,
+      "eval_steps_per_second": 0.364,
+      "step": 23
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.23789134722319716,
+      "learning_rate": 2e-05,
+      "loss": 1.0669,
+      "step": 24
+    },
+    {
+      "epoch": 0.75,
+      "eval_loss": 0.9751532077789307,
+      "eval_runtime": 35.6905,
+      "eval_samples_per_second": 5.604,
+      "eval_steps_per_second": 0.364,
+      "step": 24
+    },
+    {
+      "epoch": 0.78125,
+      "grad_norm": 0.26302325685095884,
+      "learning_rate": 2e-05,
+      "loss": 1.0141,
+      "step": 25
+    },
+    {
+      "epoch": 0.78125,
+      "eval_loss": 0.9684178233146667,
+      "eval_runtime": 35.4693,
+      "eval_samples_per_second": 5.639,
+      "eval_steps_per_second": 0.367,
+      "step": 25
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 0.2406662725995088,
+      "learning_rate": 2e-05,
+      "loss": 1.0381,
+      "step": 26
+    },
+    {
+      "epoch": 0.8125,
+      "eval_loss": 0.9618947505950928,
+      "eval_runtime": 37.5325,
+      "eval_samples_per_second": 5.329,
+      "eval_steps_per_second": 0.346,
+      "step": 26
+    },
+    {
+      "epoch": 0.84375,
+      "grad_norm": 0.27899113172875245,
+      "learning_rate": 2e-05,
+      "loss": 0.9693,
+      "step": 27
+    },
+    {
+      "epoch": 0.84375,
+      "eval_loss": 0.9552007913589478,
+      "eval_runtime": 37.4006,
+      "eval_samples_per_second": 5.348,
+      "eval_steps_per_second": 0.348,
+      "step": 27
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 0.29303174930955905,
+      "learning_rate": 2e-05,
+      "loss": 0.9841,
+      "step": 28
+    },
+    {
+      "epoch": 0.875,
+      "eval_loss": 0.9481881856918335,
+      "eval_runtime": 37.7821,
+      "eval_samples_per_second": 5.294,
+      "eval_steps_per_second": 0.344,
+      "step": 28
+    },
+    {
+      "epoch": 0.90625,
+      "grad_norm": 0.22138226087715307,
+      "learning_rate": 2e-05,
+      "loss": 0.9959,
+      "step": 29
+    },
+    {
+      "epoch": 0.90625,
+      "eval_loss": 0.9415397644042969,
+      "eval_runtime": 37.9058,
+      "eval_samples_per_second": 5.276,
+      "eval_steps_per_second": 0.343,
+      "step": 29
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 0.23456101188675513,
+      "learning_rate": 2e-05,
+      "loss": 1.0351,
+      "step": 30
+    },
+    {
+      "epoch": 0.9375,
+      "eval_loss": 0.9354143738746643,
+      "eval_runtime": 37.9727,
+      "eval_samples_per_second": 5.267,
+      "eval_steps_per_second": 0.342,
+      "step": 30
+    },
+    {
+      "epoch": 0.96875,
+      "grad_norm": 0.2594838155429295,
+      "learning_rate": 2e-05,
+      "loss": 0.8741,
+      "step": 31
+    },
+    {
+      "epoch": 0.96875,
+      "eval_loss": 0.9291737079620361,
+      "eval_runtime": 37.081,
+      "eval_samples_per_second": 5.394,
+      "eval_steps_per_second": 0.351,
+      "step": 31
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.2404582058613114,
+      "learning_rate": 2e-05,
+      "loss": 0.9814,
+      "step": 32
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.9231625199317932,
+      "eval_runtime": 37.0946,
+      "eval_samples_per_second": 5.392,
+      "eval_steps_per_second": 0.35,
+      "step": 32
+    },
+    {
+      "epoch": 1.03125,
+      "grad_norm": 0.26862391186560797,
+      "learning_rate": 2e-05,
+      "loss": 1.0241,
+      "step": 33
+    },
+    {
+      "epoch": 1.03125,
+      "eval_loss": 0.917277991771698,
+      "eval_runtime": 37.1872,
+      "eval_samples_per_second": 5.378,
+      "eval_steps_per_second": 0.35,
+      "step": 33
+    },
+    {
+      "epoch": 1.0625,
+      "grad_norm": 0.24997341491489666,
+      "learning_rate": 2e-05,
+      "loss": 1.0296,
+      "step": 34
+    },
+    {
+      "epoch": 1.0625,
+      "eval_loss": 0.9116549491882324,
+      "eval_runtime": 30.7053,
+      "eval_samples_per_second": 6.514,
+      "eval_steps_per_second": 0.423,
+      "step": 34
+    },
+    {
+      "epoch": 1.09375,
+      "grad_norm": 0.22755062908849677,
+      "learning_rate": 2e-05,
+      "loss": 1.047,
+      "step": 35
+    },
+    {
+      "epoch": 1.09375,
+      "eval_loss": 0.9061525464057922,
+      "eval_runtime": 30.5238,
+      "eval_samples_per_second": 6.552,
+      "eval_steps_per_second": 0.426,
+      "step": 35
+    },
+    {
+      "epoch": 1.125,
+      "grad_norm": 0.2478793998097894,
+      "learning_rate": 2e-05,
+      "loss": 1.0071,
+      "step": 36
+    },
+    {
+      "epoch": 1.125,
+      "eval_loss": 0.9007319808006287,
+      "eval_runtime": 30.4573,
+      "eval_samples_per_second": 6.567,
+      "eval_steps_per_second": 0.427,
+      "step": 36
+    },
+    {
+      "epoch": 1.15625,
+      "grad_norm": 0.2319702521014333,
+      "learning_rate": 2e-05,
+      "loss": 0.9517,
+      "step": 37
+    },
+    {
+      "epoch": 1.15625,
+      "eval_loss": 0.8955077528953552,
+      "eval_runtime": 30.6396,
+      "eval_samples_per_second": 6.528,
+      "eval_steps_per_second": 0.424,
+      "step": 37
+    },
+    {
+      "epoch": 1.1875,
+      "grad_norm": 0.26929965642782505,
+      "learning_rate": 2e-05,
+      "loss": 0.9638,
+      "step": 38
+    },
+    {
+      "epoch": 1.1875,
+      "eval_loss": 0.8906582593917847,
+      "eval_runtime": 30.5706,
+      "eval_samples_per_second": 6.542,
+      "eval_steps_per_second": 0.425,
+      "step": 38
+    },
+    {
+      "epoch": 1.21875,
+      "grad_norm": 0.25494286133089294,
+      "learning_rate": 2e-05,
+      "loss": 0.9922,
+      "step": 39
+    },
+    {
+      "epoch": 1.21875,
+      "eval_loss": 0.8858879804611206,
+      "eval_runtime": 30.2267,
+      "eval_samples_per_second": 6.617,
+      "eval_steps_per_second": 0.43,
+      "step": 39
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.2468866713698415,
+      "learning_rate": 2e-05,
+      "loss": 0.9873,
+      "step": 40
+    },
+    {
+      "epoch": 1.25,
+      "eval_loss": 0.8811590671539307,
+      "eval_runtime": 30.1065,
+      "eval_samples_per_second": 6.643,
+      "eval_steps_per_second": 0.432,
+      "step": 40
+    },
+    {
+      "epoch": 1.28125,
+      "grad_norm": 0.2460619663724958,
+      "learning_rate": 2e-05,
+      "loss": 0.9608,
+      "step": 41
+    },
+    {
+      "epoch": 1.28125,
+      "eval_loss": 0.876426637172699,
+      "eval_runtime": 30.2618,
+      "eval_samples_per_second": 6.609,
+      "eval_steps_per_second": 0.43,
+      "step": 41
+    },
+    {
+      "epoch": 1.3125,
+      "grad_norm": 0.244111044045335,
+      "learning_rate": 2e-05,
+      "loss": 0.9496,
+      "step": 42
+    },
+    {
+      "epoch": 1.3125,
+      "eval_loss": 0.8720347881317139,
+      "eval_runtime": 30.2637,
+      "eval_samples_per_second": 6.609,
+      "eval_steps_per_second": 0.43,
+      "step": 42
+    },
+    {
+      "epoch": 1.34375,
+      "grad_norm": 0.24263485999072093,
+      "learning_rate": 2e-05,
+      "loss": 0.9076,
+      "step": 43
+    },
+    {
+      "epoch": 1.34375,
+      "eval_loss": 0.8677232265472412,
+      "eval_runtime": 30.0588,
+      "eval_samples_per_second": 6.654,
+      "eval_steps_per_second": 0.432,
+      "step": 43
+    },
+    {
+      "epoch": 1.375,
+      "grad_norm": 0.2549786588443146,
+      "learning_rate": 2e-05,
+      "loss": 0.9291,
+      "step": 44
+    },
+    {
+      "epoch": 1.375,
+      "eval_loss": 0.864047110080719,
+      "eval_runtime": 30.3833,
+      "eval_samples_per_second": 6.583,
+      "eval_steps_per_second": 0.428,
+      "step": 44
+    },
+    {
+      "epoch": 1.40625,
+      "grad_norm": 0.27020952324959413,
+      "learning_rate": 2e-05,
+      "loss": 0.9111,
+      "step": 45
+    },
+    {
+      "epoch": 1.40625,
+      "eval_loss": 0.8608524799346924,
+      "eval_runtime": 30.284,
+      "eval_samples_per_second": 6.604,
+      "eval_steps_per_second": 0.429,
+      "step": 45
+    },
+    {
+      "epoch": 1.4375,
+      "grad_norm": 0.24108750741309573,
+      "learning_rate": 2e-05,
+      "loss": 0.8363,
+      "step": 46
+    },
+    {
+      "epoch": 1.4375,
+      "eval_loss": 0.8525222539901733,
+      "eval_runtime": 51.3231,
+      "eval_samples_per_second": 3.897,
+      "eval_steps_per_second": 0.487,
+      "step": 46
+    },
+    {
+      "epoch": 1.46875,
+      "grad_norm": 0.23963570627035977,
+      "learning_rate": 2e-05,
+      "loss": 0.9776,
+      "step": 47
+    },
+    {
+      "epoch": 1.46875,
+      "eval_loss": 0.8498736619949341,
+      "eval_runtime": 43.9039,
+      "eval_samples_per_second": 4.555,
+      "eval_steps_per_second": 0.569,
+      "step": 47
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.2738559790360609,
+      "learning_rate": 2e-05,
+      "loss": 0.9075,
+      "step": 48
+    },
+    {
+      "epoch": 1.5,
+      "eval_loss": 0.846975564956665,
+      "eval_runtime": 43.6943,
+      "eval_samples_per_second": 4.577,
+      "eval_steps_per_second": 0.572,
+      "step": 48
+    },
+    {
+      "epoch": 1.53125,
+      "grad_norm": 0.2516715524185528,
+      "learning_rate": 2e-05,
+      "loss": 0.9256,
+      "step": 49
+    },
+    {
+      "epoch": 1.53125,
+      "eval_loss": 0.8441421985626221,
+      "eval_runtime": 44.0977,
+      "eval_samples_per_second": 4.535,
+      "eval_steps_per_second": 0.567,
+      "step": 49
+    },
+    {
+      "epoch": 1.5625,
+      "grad_norm": 0.25797542568004944,
+      "learning_rate": 2e-05,
+      "loss": 0.9168,
+      "step": 50
+    },
+    {
+      "epoch": 1.5625,
+      "eval_loss": 0.8408769369125366,
+      "eval_runtime": 45.4442,
+      "eval_samples_per_second": 4.401,
+      "eval_steps_per_second": 0.55,
+      "step": 50
+    },
+    {
+      "epoch": 1.59375,
+      "grad_norm": 0.24530872900913284,
+      "learning_rate": 2e-05,
+      "loss": 0.8547,
+      "step": 51
+    },
+    {
+      "epoch": 1.59375,
+      "eval_loss": 0.8373726010322571,
+      "eval_runtime": 44.6363,
+      "eval_samples_per_second": 4.481,
+      "eval_steps_per_second": 0.56,
+      "step": 51
+    },
+    {
+      "epoch": 1.625,
+      "grad_norm": 0.2549609506617865,
+      "learning_rate": 2e-05,
+      "loss": 0.979,
+      "step": 52
+    },
+    {
+      "epoch": 1.625,
+      "eval_loss": 0.8340890407562256,
+      "eval_runtime": 45.991,
+      "eval_samples_per_second": 4.349,
+      "eval_steps_per_second": 0.544,
+      "step": 52
+    },
+    {
+      "epoch": 1.65625,
+      "grad_norm": 0.24114496664848603,
+      "learning_rate": 2e-05,
+      "loss": 0.9196,
+      "step": 53
+    },
+    {
+      "epoch": 1.65625,
+      "eval_loss": 0.8311529755592346,
+      "eval_runtime": 46.0654,
+      "eval_samples_per_second": 4.342,
+      "eval_steps_per_second": 0.543,
+      "step": 53
+    },
+    {
+      "epoch": 1.6875,
+      "grad_norm": 0.29287872202759435,
+      "learning_rate": 2e-05,
+      "loss": 0.967,
+      "step": 54
+    },
+    {
+      "epoch": 1.6875,
+      "eval_loss": 0.8281388282775879,
+      "eval_runtime": 46.0396,
+      "eval_samples_per_second": 4.344,
+      "eval_steps_per_second": 0.543,
+      "step": 54
+    },
+    {
+      "epoch": 1.71875,
+      "grad_norm": 0.2620663114325604,
+      "learning_rate": 2e-05,
+      "loss": 0.9576,
+      "step": 55
+    },
+    {
+      "epoch": 1.71875,
+      "eval_loss": 0.8252360820770264,
+      "eval_runtime": 44.8935,
+      "eval_samples_per_second": 4.455,
+      "eval_steps_per_second": 0.557,
+      "step": 55
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.24813796796229484,
+      "learning_rate": 2e-05,
+      "loss": 0.9652,
+      "step": 56
+    },
+    {
+      "epoch": 1.75,
+      "eval_loss": 0.8228487968444824,
+      "eval_runtime": 45.9424,
+      "eval_samples_per_second": 4.353,
+      "eval_steps_per_second": 0.544,
+      "step": 56
+    },
+    {
+      "epoch": 1.78125,
+      "grad_norm": 0.25644243214043555,
+      "learning_rate": 2e-05,
+      "loss": 0.8938,
+      "step": 57
+    },
+    {
+      "epoch": 1.78125,
+      "eval_loss": 0.8202834129333496,
+      "eval_runtime": 45.4583,
+      "eval_samples_per_second": 4.4,
+      "eval_steps_per_second": 0.55,
+      "step": 57
+    },
+    {
+      "epoch": 1.8125,
+      "grad_norm": 0.24429328723074778,
+      "learning_rate": 2e-05,
+      "loss": 0.9373,
+      "step": 58
+    },
+    {
+      "epoch": 1.8125,
+      "eval_loss": 0.8179032802581787,
+      "eval_runtime": 45.7499,
+      "eval_samples_per_second": 4.372,
+      "eval_steps_per_second": 0.546,
+      "step": 58
+    },
+    {
+      "epoch": 1.84375,
+      "grad_norm": 0.26226013327841075,
+      "learning_rate": 2e-05,
+      "loss": 0.8474,
+      "step": 59
+    },
+    {
+      "epoch": 1.84375,
+      "eval_loss": 0.8154602646827698,
+      "eval_runtime": 46.1391,
+      "eval_samples_per_second": 4.335,
+      "eval_steps_per_second": 0.542,
+      "step": 59
+    },
+    {
+      "epoch": 1.875,
+      "grad_norm": 0.2581666046262149,
+      "learning_rate": 2e-05,
+      "loss": 0.8517,
+      "step": 60
+    },
+    {
+      "epoch": 1.875,
+      "eval_loss": 0.812771737575531,
+      "eval_runtime": 45.5621,
+      "eval_samples_per_second": 4.39,
+      "eval_steps_per_second": 0.549,
+      "step": 60
+    },
+    {
+      "epoch": 1.90625,
+      "grad_norm": 0.2593197258112398,
+      "learning_rate": 2e-05,
+      "loss": 0.9011,
+      "step": 61
+    },
+    {
+      "epoch": 1.90625,
+      "eval_loss": 0.810187816619873,
+      "eval_runtime": 46.0597,
+      "eval_samples_per_second": 4.342,
+      "eval_steps_per_second": 0.543,
+      "step": 61
+    },
+    {
+      "epoch": 1.9375,
+      "grad_norm": 0.2899895571193183,
+      "learning_rate": 2e-05,
+      "loss": 0.9277,
+      "step": 62
+    },
+    {
+      "epoch": 1.9375,
+      "eval_loss": 0.8083757758140564,
+      "eval_runtime": 45.8079,
+      "eval_samples_per_second": 4.366,
+      "eval_steps_per_second": 0.546,
+      "step": 62
+    },
+    {
+      "epoch": 1.96875,
+      "grad_norm": 0.2759215195414453,
+      "learning_rate": 2e-05,
+      "loss": 0.772,
+      "step": 63
+    },
+    {
+      "epoch": 1.96875,
+      "eval_loss": 0.8061204552650452,
+      "eval_runtime": 47.3286,
+      "eval_samples_per_second": 4.226,
+      "eval_steps_per_second": 0.528,
+      "step": 63
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.27248680511516205,
+      "learning_rate": 2e-05,
+      "loss": 0.874,
+      "step": 64
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.8037504553794861,
+      "eval_runtime": 46.1177,
+      "eval_samples_per_second": 4.337,
+      "eval_steps_per_second": 0.542,
+      "step": 64
+    },
+    {
+      "epoch": 2.03125,
+      "grad_norm": 0.3116755816558186,
+      "learning_rate": 2e-05,
+      "loss": 0.8647,
+      "step": 65
+    },
+    {
+      "epoch": 2.03125,
+      "eval_loss": 0.8007115125656128,
+      "eval_runtime": 46.1583,
+      "eval_samples_per_second": 4.333,
+      "eval_steps_per_second": 0.542,
+      "step": 65
+    },
+    {
+      "epoch": 2.0625,
+      "grad_norm": 0.273032515206887,
+      "learning_rate": 2e-05,
+      "loss": 0.8862,
+      "step": 66
+    },
+    {
+      "epoch": 2.0625,
+      "eval_loss": 0.7983976006507874,
+      "eval_runtime": 47.3469,
+      "eval_samples_per_second": 4.224,
+      "eval_steps_per_second": 0.528,
+      "step": 66
+    },
+    {
+      "epoch": 2.09375,
+      "grad_norm": 0.2925240383907651,
+      "learning_rate": 2e-05,
+      "loss": 0.8617,
+      "step": 67
+    },
+    {
+      "epoch": 2.09375,
+      "eval_loss": 0.7959001064300537,
+      "eval_runtime": 47.9208,
+      "eval_samples_per_second": 4.174,
+      "eval_steps_per_second": 0.522,
+      "step": 67
+    },
+    {
+      "epoch": 2.125,
+      "grad_norm": 0.25775933439981163,
+      "learning_rate": 2e-05,
+      "loss": 0.9269,
+      "step": 68
+    },
+    {
+      "epoch": 2.125,
+      "eval_loss": 0.7938115000724792,
+      "eval_runtime": 47.8909,
+      "eval_samples_per_second": 4.176,
+      "eval_steps_per_second": 0.522,
+      "step": 68
+    },
+    {
+      "epoch": 2.15625,
+      "grad_norm": 0.2669684013704678,
+      "learning_rate": 2e-05,
+      "loss": 0.8607,
+      "step": 69
+    },
+    {
+      "epoch": 2.15625,
+      "eval_loss": 0.7918573617935181,
+      "eval_runtime": 47.39,
+      "eval_samples_per_second": 4.22,
+      "eval_steps_per_second": 0.528,
+      "step": 69
+    },
+    {
+      "epoch": 2.1875,
+      "grad_norm": 0.312578346444957,
+      "learning_rate": 2e-05,
+      "loss": 0.8086,
+      "step": 70
+    },
+    {
+      "epoch": 2.1875,
+      "eval_loss": 0.7894810438156128,
+      "eval_runtime": 46.2927,
+      "eval_samples_per_second": 4.32,
+      "eval_steps_per_second": 0.54,
+      "step": 70
+    },
+    {
+      "epoch": 2.21875,
+      "grad_norm": 0.25622754870894693,
+      "learning_rate": 2e-05,
+      "loss": 0.8945,
+      "step": 71
+    },
+    {
+      "epoch": 2.21875,
+      "eval_loss": 0.7875316739082336,
+      "eval_runtime": 45.7617,
+      "eval_samples_per_second": 4.37,
+      "eval_steps_per_second": 0.546,
+      "step": 71
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 0.27025767580736354,
+      "learning_rate": 2e-05,
+      "loss": 0.815,
+      "step": 72
+    },
+    {
+      "epoch": 2.25,
+      "eval_loss": 0.7858334183692932,
+      "eval_runtime": 46.2427,
+      "eval_samples_per_second": 4.325,
+      "eval_steps_per_second": 0.541,
+      "step": 72
+    },
+    {
+      "epoch": 2.28125,
+      "grad_norm": 0.3110479115695806,
+      "learning_rate": 2e-05,
+      "loss": 0.8621,
+      "step": 73
+    },
+    {
+      "epoch": 2.28125,
+      "eval_loss": 0.7841551303863525,
+      "eval_runtime": 46.5372,
+      "eval_samples_per_second": 4.298,
+      "eval_steps_per_second": 0.537,
+      "step": 73
+    },
+    {
+      "epoch": 2.3125,
+      "grad_norm": 0.26061305588172545,
+      "learning_rate": 2e-05,
+      "loss": 0.8622,
+      "step": 74
+    },
+    {
+      "epoch": 2.3125,
+      "eval_loss": 0.7826495170593262,
+      "eval_runtime": 46.1361,
+      "eval_samples_per_second": 4.335,
+      "eval_steps_per_second": 0.542,
+      "step": 74
+    },
+    {
+      "epoch": 2.34375,
+      "grad_norm": 0.27448719719872205,
+      "learning_rate": 2e-05,
+      "loss": 0.9118,
+      "step": 75
+    },
+    {
+      "epoch": 2.34375,
+      "eval_loss": 0.7811364531517029,
+      "eval_runtime": 47.6194,
+      "eval_samples_per_second": 4.2,
+      "eval_steps_per_second": 0.525,
+      "step": 75
+    },
+    {
+      "epoch": 2.375,
+      "grad_norm": 0.27078145092639194,
+      "learning_rate": 2e-05,
+      "loss": 0.8256,
+      "step": 76
+    },
+    {
+      "epoch": 2.375,
+      "eval_loss": 0.779961109161377,
+      "eval_runtime": 46.0097,
+      "eval_samples_per_second": 4.347,
+      "eval_steps_per_second": 0.543,
+      "step": 76
+    },
+    {
+      "epoch": 2.40625,
+      "grad_norm": 0.2634646272324293,
+      "learning_rate": 2e-05,
+      "loss": 0.8774,
+      "step": 77
+    },
+    {
+      "epoch": 2.40625,
+      "eval_loss": 0.7788712978363037,
+      "eval_runtime": 46.2712,
+      "eval_samples_per_second": 4.322,
+      "eval_steps_per_second": 0.54,
+      "step": 77
+    },
+    {
+      "epoch": 2.4375,
+      "grad_norm": 0.3101668401682978,
+      "learning_rate": 2e-05,
+      "loss": 0.8769,
+      "step": 78
+    },
+    {
+      "epoch": 2.4375,
+      "eval_loss": 0.7776928544044495,
+      "eval_runtime": 46.3791,
+      "eval_samples_per_second": 4.312,
+      "eval_steps_per_second": 0.539,
+      "step": 78
+    },
+    {
+      "epoch": 2.46875,
+      "grad_norm": 0.28798302574187284,
+      "learning_rate": 2e-05,
+      "loss": 0.8765,
+      "step": 79
+    },
+    {
+      "epoch": 2.46875,
+      "eval_loss": 0.7773044109344482,
+      "eval_runtime": 43.9352,
+      "eval_samples_per_second": 4.552,
+      "eval_steps_per_second": 0.569,
+      "step": 79
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.3349887736240022,
+      "learning_rate": 2e-05,
+      "loss": 0.9202,
+      "step": 80
+    },
+    {
+      "epoch": 2.5,
+      "eval_loss": 0.7766420245170593,
+      "eval_runtime": 44.0118,
+      "eval_samples_per_second": 4.544,
+      "eval_steps_per_second": 0.568,
+      "step": 80
+    },
+    {
+      "epoch": 2.53125,
+      "grad_norm": 0.3272989979927921,
+      "learning_rate": 2e-05,
+      "loss": 0.8496,
+      "step": 81
+    },
+    {
+      "epoch": 2.53125,
+      "eval_loss": 0.7754170894622803,
+      "eval_runtime": 44.5079,
+      "eval_samples_per_second": 4.494,
+      "eval_steps_per_second": 0.562,
+      "step": 81
+    },
+    {
+      "epoch": 2.5625,
+      "grad_norm": 0.2937867633662159,
+      "learning_rate": 2e-05,
+      "loss": 0.9088,
+      "step": 82
+    },
+    {
+      "epoch": 2.5625,
+      "eval_loss": 0.7740327715873718,
+      "eval_runtime": 43.7759,
+      "eval_samples_per_second": 4.569,
+      "eval_steps_per_second": 0.571,
+      "step": 82
+    },
+    {
+      "epoch": 2.59375,
+      "grad_norm": 0.3001827875228488,
+      "learning_rate": 2e-05,
+      "loss": 0.8514,
+      "step": 83
+    },
+    {
+      "epoch": 2.59375,
+      "eval_loss": 0.7725099921226501,
+      "eval_runtime": 43.9246,
+      "eval_samples_per_second": 4.553,
+      "eval_steps_per_second": 0.569,
+      "step": 83
+    },
+    {
+      "epoch": 2.625,
+      "grad_norm": 0.3153202233063334,
+      "learning_rate": 2e-05,
+      "loss": 0.8232,
+      "step": 84
+    },
+    {
+      "epoch": 2.625,
+      "eval_loss": 0.7707765698432922,
+      "eval_runtime": 45.7981,
+      "eval_samples_per_second": 4.367,
+      "eval_steps_per_second": 0.546,
+      "step": 84
+    },
+    {
+      "epoch": 2.65625,
+      "grad_norm": 0.3084122812305825,
+      "learning_rate": 2e-05,
+      "loss": 0.7899,
+      "step": 85
+    },
+    {
+      "epoch": 2.65625,
+      "eval_loss": 0.7689283490180969,
+      "eval_runtime": 43.8712,
+      "eval_samples_per_second": 4.559,
+      "eval_steps_per_second": 0.57,
+      "step": 85
+    },
+    {
+      "epoch": 2.6875,
+      "grad_norm": 0.34994590801092706,
+      "learning_rate": 2e-05,
+      "loss": 0.8186,
+      "step": 86
+    },
+    {
+      "epoch": 2.6875,
+      "eval_loss": 0.7668275237083435,
+      "eval_runtime": 44.0477,
+      "eval_samples_per_second": 4.541,
+      "eval_steps_per_second": 0.568,
+      "step": 86
+    },
+    {
+      "epoch": 2.71875,
+      "grad_norm": 0.33626535961990944,
+      "learning_rate": 2e-05,
+      "loss": 0.8439,
+      "step": 87
+    },
+    {
+      "epoch": 2.71875,
+      "eval_loss": 0.7653672695159912,
+      "eval_runtime": 43.9923,
+      "eval_samples_per_second": 4.546,
+      "eval_steps_per_second": 0.568,
+      "step": 87
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.33991458856080364,
+      "learning_rate": 2e-05,
+      "loss": 0.9309,
+      "step": 88
+    },
+    {
+      "epoch": 2.75,
+      "eval_loss": 0.7641142010688782,
+      "eval_runtime": 44.018,
+      "eval_samples_per_second": 4.544,
+      "eval_steps_per_second": 0.568,
+      "step": 88
+    },
+    {
+      "epoch": 2.78125,
+      "grad_norm": 0.3212547051979476,
+      "learning_rate": 2e-05,
+      "loss": 0.8262,
+      "step": 89
+    },
+    {
+      "epoch": 2.78125,
+      "eval_loss": 0.763224720954895,
+      "eval_runtime": 43.7722,
+      "eval_samples_per_second": 4.569,
+      "eval_steps_per_second": 0.571,
+      "step": 89
+    },
+    {
+      "epoch": 2.8125,
+      "grad_norm": 0.335120027091876,
+      "learning_rate": 2e-05,
+      "loss": 0.8795,
+      "step": 90
+    },
+    {
+      "epoch": 2.8125,
+      "eval_loss": 0.7624655365943909,
+      "eval_runtime": 44.1972,
+      "eval_samples_per_second": 4.525,
+      "eval_steps_per_second": 0.566,
+      "step": 90
+    },
+    {
+      "epoch": 2.84375,
+      "grad_norm": 0.33822766071160937,
+      "learning_rate": 2e-05,
+      "loss": 0.7798,
+      "step": 91
+    },
+    {
+      "epoch": 2.84375,
+      "eval_loss": 0.761708676815033,
+      "eval_runtime": 43.8244,
+      "eval_samples_per_second": 4.564,
+      "eval_steps_per_second": 0.57,
+      "step": 91
+    },
+    {
+      "epoch": 2.875,
+      "grad_norm": 0.33505853726890483,
+      "learning_rate": 2e-05,
+      "loss": 0.8715,
+      "step": 92
+    },
+    {
+      "epoch": 2.875,
+      "eval_loss": 0.7611495852470398,
+      "eval_runtime": 43.7833,
+      "eval_samples_per_second": 4.568,
+      "eval_steps_per_second": 0.571,
+      "step": 92
+    },
+    {
+      "epoch": 2.90625,
+      "grad_norm": 0.3126942865091584,
+      "learning_rate": 2e-05,
+      "loss": 0.8102,
+      "step": 93
+    },
+    {
+      "epoch": 2.90625,
+      "eval_loss": 0.7608107924461365,
+      "eval_runtime": 44.0119,
+      "eval_samples_per_second": 4.544,
+      "eval_steps_per_second": 0.568,
+      "step": 93
+    },
+    {
+      "epoch": 2.9375,
+      "grad_norm": 0.3594152593867412,
+      "learning_rate": 2e-05,
+      "loss": 0.8871,
+      "step": 94
+    },
+    {
+      "epoch": 2.9375,
+      "eval_loss": 0.7598913311958313,
+      "eval_runtime": 43.8956,
+      "eval_samples_per_second": 4.556,
+      "eval_steps_per_second": 0.57,
+      "step": 94
+    },
+    {
+      "epoch": 2.96875,
+      "grad_norm": 0.3161380007473764,
+      "learning_rate": 2e-05,
+      "loss": 0.8278,
+      "step": 95
+    },
+    {
+      "epoch": 2.96875,
+      "eval_loss": 0.7596660852432251,
+      "eval_runtime": 44.0687,
+      "eval_samples_per_second": 4.538,
+      "eval_steps_per_second": 0.567,
+      "step": 95
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.3922097294803287,
+      "learning_rate": 2e-05,
+      "loss": 0.7988,
+      "step": 96
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.7576884627342224,
+      "eval_runtime": 44.1881,
+      "eval_samples_per_second": 4.526,
+      "eval_steps_per_second": 0.566,
+      "step": 96
+    },
+    {
+      "epoch": 3.03125,
+      "grad_norm": 0.372234038126675,
+      "learning_rate": 2e-05,
+      "loss": 0.7558,
+      "step": 97
+    },
+    {
+      "epoch": 3.03125,
+      "eval_loss": 0.7546435594558716,
+      "eval_runtime": 43.8881,
+      "eval_samples_per_second": 4.557,
+      "eval_steps_per_second": 0.57,
+      "step": 97
+    },
+    {
+      "epoch": 3.0625,
+      "grad_norm": 0.3249396043376576,
+      "learning_rate": 2e-05,
+      "loss": 0.8422,
+      "step": 98
+    },
+    {
+      "epoch": 3.0625,
+      "eval_loss": 0.7515354752540588,
+      "eval_runtime": 44.5887,
+      "eval_samples_per_second": 4.485,
+      "eval_steps_per_second": 0.561,
+      "step": 98
+    },
+    {
+      "epoch": 3.09375,
+      "grad_norm": 0.3194387311297811,
+      "learning_rate": 2e-05,
+      "loss": 0.8059,
+      "step": 99
+    },
+    {
+      "epoch": 3.09375,
+      "eval_loss": 0.7486842274665833,
+      "eval_runtime": 44.0967,
+      "eval_samples_per_second": 4.535,
+      "eval_steps_per_second": 0.567,
+      "step": 99
+    },
+    {
+      "epoch": 3.125,
+      "grad_norm": 0.3434194037136213,
+      "learning_rate": 2e-05,
+      "loss": 0.8341,
+      "step": 100
+    },
+    {
+      "epoch": 3.125,
+      "eval_loss": 0.7464652061462402,
+      "eval_runtime": 44.0666,
+      "eval_samples_per_second": 4.539,
+      "eval_steps_per_second": 0.567,
+      "step": 100
+    },
+    {
+      "epoch": 3.15625,
+      "grad_norm": 0.33666008484696835,
+      "learning_rate": 2e-05,
+      "loss": 0.7731,
+      "step": 101
+    },
+    {
+      "epoch": 3.15625,
+      "eval_loss": 0.7450191378593445,
+      "eval_runtime": 44.0337,
+      "eval_samples_per_second": 4.542,
+      "eval_steps_per_second": 0.568,
+      "step": 101
+    },
+    {
+      "epoch": 3.1875,
+      "grad_norm": 0.3596265575837954,
+      "learning_rate": 2e-05,
+      "loss": 0.8354,
+      "step": 102
+    },
+    {
+      "epoch": 3.1875,
+      "eval_loss": 0.7442840337753296,
+      "eval_runtime": 44.0804,
+      "eval_samples_per_second": 4.537,
+      "eval_steps_per_second": 0.567,
+      "step": 102
+    },
+    {
+      "epoch": 3.21875,
+      "grad_norm": 0.37228869739935877,
+      "learning_rate": 2e-05,
+      "loss": 0.8476,
+      "step": 103
+    },
+    {
+      "epoch": 3.21875,
+      "eval_loss": 0.74405837059021,
+      "eval_runtime": 43.9201,
+      "eval_samples_per_second": 4.554,
+      "eval_steps_per_second": 0.569,
+      "step": 103
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 0.372126737706513,
+      "learning_rate": 2e-05,
+      "loss": 0.7568,
+      "step": 104
+    },
+    {
+      "epoch": 3.25,
+      "eval_loss": 0.7435027360916138,
+      "eval_runtime": 44.0105,
+      "eval_samples_per_second": 4.544,
+      "eval_steps_per_second": 0.568,
+      "step": 104
+    },
+    {
+      "epoch": 3.28125,
+      "grad_norm": 0.3362686942090606,
+      "learning_rate": 2e-05,
+      "loss": 0.8035,
+      "step": 105
+    },
+    {
+      "epoch": 3.28125,
+      "eval_loss": 0.7431904673576355,
+      "eval_runtime": 43.9113,
+      "eval_samples_per_second": 4.555,
+      "eval_steps_per_second": 0.569,
+      "step": 105
+    },
+    {
+      "epoch": 3.3125,
+      "grad_norm": 0.36392229188159225,
+      "learning_rate": 2e-05,
+      "loss": 0.8353,
+      "step": 106
+    },
+    {
+      "epoch": 3.3125,
+      "eval_loss": 0.7430496215820312,
+      "eval_runtime": 44.6371,
+      "eval_samples_per_second": 4.481,
+      "eval_steps_per_second": 0.56,
+      "step": 106
+    },
+    {
+      "epoch": 3.34375,
+      "grad_norm": 0.4471327905090859,
+      "learning_rate": 2e-05,
+      "loss": 0.7363,
+      "step": 107
+    },
+    {
+      "epoch": 3.34375,
+      "eval_loss": 0.7411425709724426,
+      "eval_runtime": 44.7094,
+      "eval_samples_per_second": 4.473,
+      "eval_steps_per_second": 0.559,
+      "step": 107
+    },
+    {
+      "epoch": 3.375,
+      "grad_norm": 0.3716356236311949,
+      "learning_rate": 2e-05,
+      "loss": 0.7774,
+      "step": 108
+    },
+    {
+      "epoch": 3.375,
+      "eval_loss": 0.7391970753669739,
+      "eval_runtime": 44.6877,
+      "eval_samples_per_second": 4.476,
+      "eval_steps_per_second": 0.559,
+      "step": 108
+    },
+    {
+      "epoch": 3.40625,
+      "grad_norm": 0.39848151618324823,
+      "learning_rate": 2e-05,
+      "loss": 0.766,
+      "step": 109
+    },
+    {
+      "epoch": 3.40625,
+      "eval_loss": 0.7370663285255432,
+      "eval_runtime": 44.7716,
+      "eval_samples_per_second": 4.467,
+      "eval_steps_per_second": 0.558,
+      "step": 109
+    },
+    {
+      "epoch": 3.4375,
+      "grad_norm": 0.3979613694284285,
+      "learning_rate": 2e-05,
+      "loss": 0.7647,
+      "step": 110
+    },
+    {
+      "epoch": 3.4375,
+      "eval_loss": 0.7347142100334167,
+      "eval_runtime": 46.1551,
+      "eval_samples_per_second": 4.333,
+      "eval_steps_per_second": 0.542,
+      "step": 110
+    },
+    {
+      "epoch": 3.46875,
+      "grad_norm": 0.4005021474949748,
+      "learning_rate": 2e-05,
+      "loss": 0.8363,
+      "step": 111
+    },
+    {
+      "epoch": 3.46875,
+      "eval_loss": 0.7330761551856995,
+      "eval_runtime": 45.4921,
+      "eval_samples_per_second": 4.396,
+      "eval_steps_per_second": 0.55,
+      "step": 111
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 0.3814831442952738,
+      "learning_rate": 2e-05,
+      "loss": 0.8172,
+      "step": 112
+    },
+    {
+      "epoch": 3.5,
+      "eval_loss": 0.7321842908859253,
+      "eval_runtime": 46.3117,
+      "eval_samples_per_second": 4.319,
+      "eval_steps_per_second": 0.54,
+      "step": 112
+    },
+    {
+      "epoch": 3.53125,
+      "grad_norm": 0.37084330088188894,
+      "learning_rate": 2e-05,
+      "loss": 0.8984,
+      "step": 113
+    },
+    {
+      "epoch": 3.53125,
+      "eval_loss": 0.7323736548423767,
+      "eval_runtime": 45.7394,
+      "eval_samples_per_second": 4.373,
+      "eval_steps_per_second": 0.547,
+      "step": 113
+    },
+    {
+      "epoch": 3.5625,
+      "grad_norm": 0.4074607742772961,
+      "learning_rate": 2e-05,
+      "loss": 0.7623,
+      "step": 114
+    },
+    {
+      "epoch": 3.5625,
+      "eval_loss": 0.7331156134605408,
+      "eval_runtime": 47.2117,
+      "eval_samples_per_second": 4.236,
+      "eval_steps_per_second": 0.53,
+      "step": 114
+    },
+    {
+      "epoch": 3.59375,
+      "grad_norm": 0.3478981526620727,
+      "learning_rate": 2e-05,
+      "loss": 0.8294,
+      "step": 115
+    },
+    {
+      "epoch": 3.59375,
+      "eval_loss": 0.7339057326316833,
+      "eval_runtime": 45.3783,
+      "eval_samples_per_second": 4.407,
+      "eval_steps_per_second": 0.551,
+      "step": 115
+    },
+    {
+      "epoch": 3.625,
+      "grad_norm": 0.4015868947675386,
+      "learning_rate": 2e-05,
+      "loss": 0.8,
+      "step": 116
+    },
+    {
+      "epoch": 3.625,
+      "eval_loss": 0.7341201305389404,
+      "eval_runtime": 45.9888,
+      "eval_samples_per_second": 4.349,
+      "eval_steps_per_second": 0.544,
+      "step": 116
+    },
+    {
+      "epoch": 3.65625,
+      "grad_norm": 0.3908261734781783,
+      "learning_rate": 2e-05,
+      "loss": 0.7903,
+      "step": 117
+    },
+    {
+      "epoch": 3.65625,
+      "eval_loss": 0.7336520552635193,
+      "eval_runtime": 45.9012,
+      "eval_samples_per_second": 4.357,
+      "eval_steps_per_second": 0.545,
+      "step": 117
+    },
+    {
+      "epoch": 3.6875,
+      "grad_norm": 0.39497646856232355,
+      "learning_rate": 2e-05,
+      "loss": 0.8072,
+      "step": 118
+    },
+    {
+      "epoch": 3.6875,
+      "eval_loss": 0.7335306406021118,
+      "eval_runtime": 46.2389,
+      "eval_samples_per_second": 4.325,
+      "eval_steps_per_second": 0.541,
+      "step": 118
+    },
+    {
+      "epoch": 3.71875,
+      "grad_norm": 0.3773137872461335,
+      "learning_rate": 2e-05,
+      "loss": 0.8647,
+      "step": 119
+    },
+    {
+      "epoch": 3.71875,
+      "eval_loss": 0.7331534028053284,
+      "eval_runtime": 46.662,
+      "eval_samples_per_second": 4.286,
+      "eval_steps_per_second": 0.536,
+      "step": 119
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.353841599712999,
+      "learning_rate": 2e-05,
+      "loss": 0.8076,
+      "step": 120
+    },
+    {
+      "epoch": 3.75,
+      "eval_loss": 0.732619047164917,
+      "eval_runtime": 47.5847,
+      "eval_samples_per_second": 4.203,
+      "eval_steps_per_second": 0.525,
+      "step": 120
+    },
+    {
+      "epoch": 3.78125,
+      "grad_norm": 0.38703604888096965,
+      "learning_rate": 2e-05,
+      "loss": 0.783,
+      "step": 121
+    },
+    {
+      "epoch": 3.78125,
+      "eval_loss": 0.7308679223060608,
+      "eval_runtime": 47.3672,
+      "eval_samples_per_second": 4.222,
+      "eval_steps_per_second": 0.528,
+      "step": 121
+    },
+    {
+      "epoch": 3.8125,
+      "grad_norm": 0.406784109988961,
+      "learning_rate": 2e-05,
+      "loss": 0.8592,
+      "step": 122
+    },
+    {
+      "epoch": 3.8125,
+      "eval_loss": 0.7294270396232605,
+      "eval_runtime": 46.3156,
+      "eval_samples_per_second": 4.318,
+      "eval_steps_per_second": 0.54,
+      "step": 122
+    },
+    {
+      "epoch": 3.84375,
+      "grad_norm": 0.3867362432665531,
+      "learning_rate": 2e-05,
+      "loss": 0.7773,
+      "step": 123
+    },
+    {
+      "epoch": 3.84375,
+      "eval_loss": 0.7278974056243896,
+      "eval_runtime": 46.0714,
+      "eval_samples_per_second": 4.341,
+      "eval_steps_per_second": 0.543,
+      "step": 123
+    },
+    {
+      "epoch": 3.875,
+      "grad_norm": 0.37454905814944983,
+      "learning_rate": 2e-05,
+      "loss": 0.8054,
+      "step": 124
+    },
+    {
+      "epoch": 3.875,
+      "eval_loss": 0.7264491319656372,
+      "eval_runtime": 46.0579,
+      "eval_samples_per_second": 4.342,
+      "eval_steps_per_second": 0.543,
+      "step": 124
+    },
+    {
+      "epoch": 3.90625,
+      "grad_norm": 0.444384159363942,
+      "learning_rate": 2e-05,
+      "loss": 0.8434,
+      "step": 125
+    },
+    {
+      "epoch": 3.90625,
+      "eval_loss": 0.7248883843421936,
+      "eval_runtime": 46.2593,
+      "eval_samples_per_second": 4.323,
+      "eval_steps_per_second": 0.54,
+      "step": 125
+    },
+    {
+      "epoch": 3.9375,
+      "grad_norm": 0.4296603454332508,
+      "learning_rate": 2e-05,
+      "loss": 0.8154,
+      "step": 126
+    },
+    {
+      "epoch": 3.9375,
+      "eval_loss": 0.7236350774765015,
+      "eval_runtime": 47.8167,
+      "eval_samples_per_second": 4.183,
+      "eval_steps_per_second": 0.523,
+      "step": 126
+    },
+    {
+      "epoch": 3.96875,
+      "grad_norm": 0.4369101294390371,
+      "learning_rate": 2e-05,
+      "loss": 0.7759,
+      "step": 127
+    },
+    {
+      "epoch": 3.96875,
+      "eval_loss": 0.7224241495132446,
+      "eval_runtime": 45.8583,
+      "eval_samples_per_second": 4.361,
+      "eval_steps_per_second": 0.545,
+      "step": 127
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.4294598409798285,
+      "learning_rate": 2e-05,
+      "loss": 0.706,
+      "step": 128
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.7210729718208313,
+      "eval_runtime": 45.9047,
+      "eval_samples_per_second": 4.357,
+      "eval_steps_per_second": 0.545,
+      "step": 128
+    },
+    {
+      "epoch": 4.03125,
+      "grad_norm": 0.355178274167416,
+      "learning_rate": 2e-05,
+      "loss": 0.7969,
+      "step": 129
+    },
+    {
+      "epoch": 4.03125,
+      "eval_loss": 0.7206510901451111,
+      "eval_runtime": 46.1016,
+      "eval_samples_per_second": 4.338,
+      "eval_steps_per_second": 0.542,
+      "step": 129
+    },
+    {
+      "epoch": 4.0625,
+      "grad_norm": 0.39855476598487416,
+      "learning_rate": 2e-05,
+      "loss": 0.8124,
+      "step": 130
+    },
+    {
+      "epoch": 4.0625,
+      "eval_loss": 0.7203733921051025,
+      "eval_runtime": 46.5052,
+      "eval_samples_per_second": 4.301,
+      "eval_steps_per_second": 0.538,
+      "step": 130
+    },
+    {
+      "epoch": 4.09375,
+      "grad_norm": 0.38252767359910733,
+      "learning_rate": 2e-05,
+      "loss": 0.8126,
+      "step": 131
+    },
+    {
+      "epoch": 4.09375,
+      "eval_loss": 0.7201277017593384,
+      "eval_runtime": 47.5144,
+      "eval_samples_per_second": 4.209,
+      "eval_steps_per_second": 0.526,
+      "step": 131
+    },
+    {
+      "epoch": 4.125,
+      "grad_norm": 0.44006887742113143,
+      "learning_rate": 2e-05,
+      "loss": 0.7706,
+      "step": 132
+    },
+    {
+      "epoch": 4.125,
+      "eval_loss": 0.7195135354995728,
+      "eval_runtime": 45.8417,
+      "eval_samples_per_second": 4.363,
+      "eval_steps_per_second": 0.545,
+      "step": 132
+    },
+    {
+      "epoch": 4.15625,
+      "grad_norm": 0.426129225179819,
+      "learning_rate": 2e-05,
+      "loss": 0.8699,
+      "step": 133
+    },
+    {
+      "epoch": 4.15625,
+      "eval_loss": 0.7189508080482483,
+      "eval_runtime": 46.2247,
+      "eval_samples_per_second": 4.327,
+      "eval_steps_per_second": 0.541,
+      "step": 133
+    },
+    {
+      "epoch": 4.1875,
+      "grad_norm": 0.4995092725647276,
+      "learning_rate": 2e-05,
+      "loss": 0.7811,
+      "step": 134
+    },
+    {
+      "epoch": 4.1875,
+      "eval_loss": 0.7180965542793274,
+      "eval_runtime": 46.4605,
+      "eval_samples_per_second": 4.305,
+      "eval_steps_per_second": 0.538,
+      "step": 134
+    },
+    {
+      "epoch": 4.21875,
+      "grad_norm": 0.42664484060733815,
+      "learning_rate": 2e-05,
+      "loss": 0.7795,
+      "step": 135
+    },
+    {
+      "epoch": 4.21875,
+      "eval_loss": 0.7173775434494019,
+      "eval_runtime": 46.1896,
+      "eval_samples_per_second": 4.33,
+      "eval_steps_per_second": 0.541,
+      "step": 135
+    },
+    {
+      "epoch": 4.25,
+      "grad_norm": 0.43970733071879864,
+      "learning_rate": 2e-05,
+      "loss": 0.772,
+      "step": 136
+    },
+    {
+      "epoch": 4.25,
+      "eval_loss": 0.716987133026123,
+      "eval_runtime": 45.88,
+      "eval_samples_per_second": 4.359,
+      "eval_steps_per_second": 0.545,
+      "step": 136
+    },
+    {
+      "epoch": 4.28125,
+      "grad_norm": 0.4585774179958974,
+      "learning_rate": 2e-05,
+      "loss": 0.7594,
+      "step": 137
+    },
+    {
+      "epoch": 4.28125,
+      "eval_loss": 0.7162837386131287,
+      "eval_runtime": 45.9687,
+      "eval_samples_per_second": 4.351,
+      "eval_steps_per_second": 0.544,
+      "step": 137
+    },
+    {
+      "epoch": 4.3125,
+      "grad_norm": 0.4482018280143517,
+      "learning_rate": 2e-05,
+      "loss": 0.7702,
+      "step": 138
+    },
+    {
+      "epoch": 4.3125,
+      "eval_loss": 0.7155399918556213,
+      "eval_runtime": 46.1566,
+      "eval_samples_per_second": 4.333,
+      "eval_steps_per_second": 0.542,
+      "step": 138
+    },
+    {
+      "epoch": 4.34375,
+      "grad_norm": 0.44262087649988896,
+      "learning_rate": 2e-05,
+      "loss": 0.7323,
+      "step": 139
+    },
+    {
+      "epoch": 4.34375,
+      "eval_loss": 0.7145451307296753,
+      "eval_runtime": 46.2257,
+      "eval_samples_per_second": 4.327,
+      "eval_steps_per_second": 0.541,
+      "step": 139
+    },
+    {
+      "epoch": 4.375,
+      "grad_norm": 0.4418100350036369,
+      "learning_rate": 2e-05,
+      "loss": 0.7669,
+      "step": 140
+    },
+    {
+      "epoch": 4.375,
+      "eval_loss": 0.7139186263084412,
+      "eval_runtime": 46.1994,
+      "eval_samples_per_second": 4.329,
+      "eval_steps_per_second": 0.541,
+      "step": 140
+    },
+    {
+      "epoch": 4.40625,
+      "grad_norm": 0.4068223149751762,
+      "learning_rate": 2e-05,
+      "loss": 0.7806,
+      "step": 141
+    },
+    {
+      "epoch": 4.40625,
+      "eval_loss": 0.7134376764297485,
+      "eval_runtime": 48.1068,
+      "eval_samples_per_second": 4.157,
+      "eval_steps_per_second": 0.52,
+      "step": 141
+    },
+    {
+      "epoch": 4.4375,
+      "grad_norm": 0.4339025102618351,
+      "learning_rate": 2e-05,
+      "loss": 0.7312,
+      "step": 142
+    },
+    {
+      "epoch": 4.4375,
+      "eval_loss": 0.7134268879890442,
+      "eval_runtime": 46.8951,
+      "eval_samples_per_second": 4.265,
+      "eval_steps_per_second": 0.533,
+      "step": 142
+    },
+    {
+      "epoch": 4.46875,
+      "grad_norm": 0.45474838622605346,
+      "learning_rate": 2e-05,
+      "loss": 0.7358,
+      "step": 143
+    },
+    {
+      "epoch": 4.46875,
+      "eval_loss": 0.7131960391998291,
+      "eval_runtime": 46.8155,
+      "eval_samples_per_second": 4.272,
+      "eval_steps_per_second": 0.534,
+      "step": 143
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 0.4284980958119551,
+      "learning_rate": 2e-05,
+      "loss": 0.7146,
+      "step": 144
+    },
+    {
+      "epoch": 4.5,
+      "eval_loss": 0.7122372388839722,
+      "eval_runtime": 46.7899,
+      "eval_samples_per_second": 4.274,
+      "eval_steps_per_second": 0.534,
+      "step": 144
+    },
+    {
+      "epoch": 4.53125,
+      "grad_norm": 0.4679473362578349,
+      "learning_rate": 2e-05,
+      "loss": 0.8018,
+      "step": 145
+    },
+    {
+      "epoch": 4.53125,
+      "eval_loss": 0.7106640338897705,
+      "eval_runtime": 46.845,
+      "eval_samples_per_second": 4.269,
+      "eval_steps_per_second": 0.534,
+      "step": 145
+    },
+    {
+      "epoch": 4.5625,
+      "grad_norm": 0.4900067169351881,
+      "learning_rate": 2e-05,
+      "loss": 0.6884,
+      "step": 146
+    },
+    {
+      "epoch": 4.5625,
+      "eval_loss": 0.7087500095367432,
+      "eval_runtime": 47.5958,
+      "eval_samples_per_second": 4.202,
+      "eval_steps_per_second": 0.525,
+      "step": 146
+    },
+    {
+      "epoch": 4.59375,
+      "grad_norm": 0.4734076525152252,
+      "learning_rate": 2e-05,
+      "loss": 0.7491,
+      "step": 147
+    },
+    {
+      "epoch": 4.59375,
+      "eval_loss": 0.7072947025299072,
+      "eval_runtime": 48.7251,
+      "eval_samples_per_second": 4.105,
+      "eval_steps_per_second": 0.513,
+      "step": 147
+    },
+    {
+      "epoch": 4.625,
+      "grad_norm": 0.44251158400098356,
+      "learning_rate": 2e-05,
+      "loss": 0.7052,
+      "step": 148
+    },
+    {
+      "epoch": 4.625,
+      "eval_loss": 0.7068507671356201,
+      "eval_runtime": 47.7025,
+      "eval_samples_per_second": 4.193,
+      "eval_steps_per_second": 0.524,
+      "step": 148
+    },
+    {
+      "epoch": 4.65625,
+      "grad_norm": 0.4304625716692019,
+      "learning_rate": 2e-05,
+      "loss": 0.8176,
+      "step": 149
+    },
+    {
+      "epoch": 4.65625,
+      "eval_loss": 0.7074388265609741,
+      "eval_runtime": 48.6321,
+      "eval_samples_per_second": 4.113,
+      "eval_steps_per_second": 0.514,
+      "step": 149
+    },
+    {
+      "epoch": 4.6875,
+      "grad_norm": 0.5157530943388945,
+      "learning_rate": 2e-05,
+      "loss": 0.7429,
+      "step": 150
+    },
+    {
+      "epoch": 4.6875,
+      "eval_loss": 0.7071186900138855,
+      "eval_runtime": 47.9557,
+      "eval_samples_per_second": 4.171,
+      "eval_steps_per_second": 0.521,
+      "step": 150
+    },
+    {
+      "epoch": 4.71875,
+      "grad_norm": 0.5469994539610319,
+      "learning_rate": 2e-05,
+      "loss": 0.7643,
+      "step": 151
+    },
+    {
+      "epoch": 4.71875,
+      "eval_loss": 0.7050415277481079,
+      "eval_runtime": 47.5207,
+      "eval_samples_per_second": 4.209,
+      "eval_steps_per_second": 0.526,
+      "step": 151
+    },
+    {
+      "epoch": 4.75,
+      "grad_norm": 0.4821891223190419,
+      "learning_rate": 2e-05,
+      "loss": 0.7795,
+      "step": 152
+    },
+    {
+      "epoch": 4.75,
+      "eval_loss": 0.7032743692398071,
+      "eval_runtime": 47.2902,
+      "eval_samples_per_second": 4.229,
+      "eval_steps_per_second": 0.529,
+      "step": 152
+    },
+    {
+      "epoch": 4.78125,
+      "grad_norm": 0.4785594997922253,
+      "learning_rate": 2e-05,
+      "loss": 0.7323,
+      "step": 153
+    },
+    {
+      "epoch": 4.78125,
+      "eval_loss": 0.7028358578681946,
+      "eval_runtime": 47.7841,
+      "eval_samples_per_second": 4.185,
+      "eval_steps_per_second": 0.523,
+      "step": 153
+    },
+    {
+      "epoch": 4.8125,
+      "grad_norm": 0.47200733754346447,
+      "learning_rate": 2e-05,
+      "loss": 0.7555,
+      "step": 154
+    },
+    {
+      "epoch": 4.8125,
+      "eval_loss": 0.7034148573875427,
+      "eval_runtime": 47.4952,
+      "eval_samples_per_second": 4.211,
+      "eval_steps_per_second": 0.526,
+      "step": 154
+    },
+    {
+      "epoch": 4.84375,
+      "grad_norm": 0.49226670914533455,
+      "learning_rate": 2e-05,
+      "loss": 0.6884,
+      "step": 155
+    },
+    {
+      "epoch": 4.84375,
+      "eval_loss": 0.7038142681121826,
+      "eval_runtime": 47.6873,
+      "eval_samples_per_second": 4.194,
+      "eval_steps_per_second": 0.524,
+      "step": 155
+    },
+    {
+      "epoch": 4.875,
+      "grad_norm": 0.4894781168701622,
+      "learning_rate": 2e-05,
+      "loss": 0.8079,
+      "step": 156
+    },
+    {
+      "epoch": 4.875,
+      "eval_loss": 0.7031099200248718,
+      "eval_runtime": 47.0438,
+      "eval_samples_per_second": 4.251,
+      "eval_steps_per_second": 0.531,
+      "step": 156
+    },
+    {
+      "epoch": 4.90625,
+      "grad_norm": 0.44465660848434874,
+      "learning_rate": 2e-05,
+      "loss": 0.7868,
+      "step": 157
+    },
+    {
+      "epoch": 4.90625,
+      "eval_loss": 0.7025811672210693,
+      "eval_runtime": 47.2897,
+      "eval_samples_per_second": 4.229,
+      "eval_steps_per_second": 0.529,
+      "step": 157
+    },
+    {
+      "epoch": 4.9375,
+      "grad_norm": 0.4671993515654777,
+      "learning_rate": 2e-05,
+      "loss": 0.7949,
+      "step": 158
+    },
+    {
+      "epoch": 4.9375,
+      "eval_loss": 0.7016230225563049,
+      "eval_runtime": 48.7147,
+      "eval_samples_per_second": 4.106,
+      "eval_steps_per_second": 0.513,
+      "step": 158
+    },
+    {
+      "epoch": 4.96875,
+      "grad_norm": 0.46593892888464733,
+      "learning_rate": 2e-05,
+      "loss": 0.7445,
+      "step": 159
+    },
+    {
+      "epoch": 4.96875,
+      "eval_loss": 0.7006258964538574,
+      "eval_runtime": 48.5723,
+      "eval_samples_per_second": 4.118,
+      "eval_steps_per_second": 0.515,
+      "step": 159
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.47383657575274585,
+      "learning_rate": 2e-05,
+      "loss": 0.7233,
+      "step": 160
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.7000269889831543,
+      "eval_runtime": 48.7517,
+      "eval_samples_per_second": 4.102,
+      "eval_steps_per_second": 0.513,
+      "step": 160
+    },
+    {
+      "epoch": 5.03125,
+      "grad_norm": 0.42723336337060835,
+      "learning_rate": 2e-05,
+      "loss": 0.7061,
+      "step": 161
+    },
+    {
+      "epoch": 5.03125,
+      "eval_loss": 0.7001045942306519,
+      "eval_runtime": 51.0355,
+      "eval_samples_per_second": 3.919,
+      "eval_steps_per_second": 0.49,
+      "step": 161
+    },
+    {
+      "epoch": 5.0625,
+      "grad_norm": 0.452950592019195,
+      "learning_rate": 2e-05,
+      "loss": 0.8489,
+      "step": 162
+    },
+    {
+      "epoch": 5.0625,
+      "eval_loss": 0.7011143565177917,
+      "eval_runtime": 44.0195,
+      "eval_samples_per_second": 4.543,
+      "eval_steps_per_second": 0.568,
+      "step": 162
+    },
+    {
+      "epoch": 5.09375,
+      "grad_norm": 0.49095068041556844,
+      "learning_rate": 2e-05,
+      "loss": 0.6523,
+      "step": 163
+    },
+    {
+      "epoch": 5.09375,
+      "eval_loss": 0.7020147442817688,
+      "eval_runtime": 43.9994,
+      "eval_samples_per_second": 4.546,
+      "eval_steps_per_second": 0.568,
+      "step": 163
+    },
+    {
+      "epoch": 5.125,
+      "grad_norm": 0.49702685752637826,
+      "learning_rate": 2e-05,
+      "loss": 0.7931,
+      "step": 164
+    },
+    {
+      "epoch": 5.125,
+      "eval_loss": 0.7026366591453552,
+      "eval_runtime": 43.7736,
+      "eval_samples_per_second": 4.569,
+      "eval_steps_per_second": 0.571,
+      "step": 164
+    },
+    {
+      "epoch": 5.15625,
+      "grad_norm": 0.5894972181165574,
+      "learning_rate": 2e-05,
+      "loss": 0.6297,
+      "step": 165
+    },
+    {
+      "epoch": 5.15625,
+      "eval_loss": 0.7018793225288391,
+      "eval_runtime": 43.8277,
+      "eval_samples_per_second": 4.563,
+      "eval_steps_per_second": 0.57,
+      "step": 165
+    },
+    {
+      "epoch": 5.1875,
+      "grad_norm": 0.5431599726243479,
+      "learning_rate": 2e-05,
+      "loss": 0.7394,
+      "step": 166
+    },
+    {
+      "epoch": 5.1875,
+      "eval_loss": 0.701405942440033,
+      "eval_runtime": 46.007,
+      "eval_samples_per_second": 4.347,
+      "eval_steps_per_second": 0.543,
+      "step": 166
+    },
+    {
+      "epoch": 5.21875,
+      "grad_norm": 0.46081080554385206,
+      "learning_rate": 2e-05,
+      "loss": 0.7587,
+      "step": 167
+    },
+    {
+      "epoch": 5.21875,
+      "eval_loss": 0.7011873126029968,
+      "eval_runtime": 45.6739,
+      "eval_samples_per_second": 4.379,
+      "eval_steps_per_second": 0.547,
+      "step": 167
+    },
+    {
+      "epoch": 5.25,
+      "grad_norm": 0.5186784959253576,
+      "learning_rate": 2e-05,
+      "loss": 0.7944,
+      "step": 168
+    },
+    {
+      "epoch": 5.25,
+      "eval_loss": 0.7006779313087463,
+      "eval_runtime": 46.6382,
+      "eval_samples_per_second": 4.288,
+      "eval_steps_per_second": 0.536,
+      "step": 168
+    },
+    {
+      "epoch": 5.28125,
+      "grad_norm": 0.484045023962852,
+      "learning_rate": 2e-05,
+      "loss": 0.7149,
+      "step": 169
+    },
+    {
+      "epoch": 5.28125,
+      "eval_loss": 0.7005323171615601,
+      "eval_runtime": 45.7584,
+      "eval_samples_per_second": 4.371,
+      "eval_steps_per_second": 0.546,
+      "step": 169
+    },
+    {
+      "epoch": 5.3125,
+      "grad_norm": 0.5719751134907255,
+      "learning_rate": 2e-05,
+      "loss": 0.6939,
+      "step": 170
+    },
+    {
+      "epoch": 5.3125,
+      "eval_loss": 0.7002266645431519,
+      "eval_runtime": 45.9679,
+      "eval_samples_per_second": 4.351,
+      "eval_steps_per_second": 0.544,
+      "step": 170
+    },
+    {
+      "epoch": 5.34375,
+      "grad_norm": 0.6060894153712378,
+      "learning_rate": 2e-05,
+      "loss": 0.7048,
+      "step": 171
+    },
+    {
+      "epoch": 5.34375,
+      "eval_loss": 0.6983186602592468,
+      "eval_runtime": 47.2598,
+      "eval_samples_per_second": 4.232,
+      "eval_steps_per_second": 0.529,
+      "step": 171
+    },
+    {
+      "epoch": 5.375,
+      "grad_norm": 0.5548499769346423,
+      "learning_rate": 2e-05,
+      "loss": 0.7881,
+      "step": 172
+    },
+    {
+      "epoch": 5.375,
+      "eval_loss": 0.6966648697853088,
+      "eval_runtime": 47.0803,
+      "eval_samples_per_second": 4.248,
+      "eval_steps_per_second": 0.531,
+      "step": 172
+    },
+    {
+      "epoch": 5.40625,
+      "grad_norm": 0.5102316819603098,
+      "learning_rate": 2e-05,
+      "loss": 0.7542,
+      "step": 173
+    },
+    {
+      "epoch": 5.40625,
+      "eval_loss": 0.6953878998756409,
+      "eval_runtime": 48.3238,
+      "eval_samples_per_second": 4.139,
+      "eval_steps_per_second": 0.517,
+      "step": 173
+    },
+    {
+      "epoch": 5.4375,
+      "grad_norm": 0.5399890621278476,
+      "learning_rate": 2e-05,
+      "loss": 0.7937,
+      "step": 174
+    },
+    {
+      "epoch": 5.4375,
+      "eval_loss": 0.69431471824646,
+      "eval_runtime": 49.2122,
+      "eval_samples_per_second": 4.064,
+      "eval_steps_per_second": 0.508,
+      "step": 174
+    },
+    {
+      "epoch": 5.46875,
+      "grad_norm": 0.5252423839534397,
+      "learning_rate": 2e-05,
+      "loss": 0.7767,
+      "step": 175
+    },
+    {
+      "epoch": 5.46875,
+      "eval_loss": 0.6944937109947205,
+      "eval_runtime": 49.0039,
+      "eval_samples_per_second": 4.081,
+      "eval_steps_per_second": 0.51,
+      "step": 175
+    },
+    {
+      "epoch": 5.5,
+      "grad_norm": 0.5422683424689886,
+      "learning_rate": 2e-05,
+      "loss": 0.7171,
+      "step": 176
+    },
+    {
+      "epoch": 5.5,
+      "eval_loss": 0.6943515539169312,
+      "eval_runtime": 48.7295,
+      "eval_samples_per_second": 4.104,
+      "eval_steps_per_second": 0.513,
+      "step": 176
+    },
+    {
+      "epoch": 5.53125,
+      "grad_norm": 0.551339022612633,
+      "learning_rate": 2e-05,
+      "loss": 0.7529,
+      "step": 177
+    },
+    {
+      "epoch": 5.53125,
+      "eval_loss": 0.6935855150222778,
+      "eval_runtime": 50.259,
+      "eval_samples_per_second": 3.979,
+      "eval_steps_per_second": 0.497,
+      "step": 177
+    },
+    {
+      "epoch": 5.5625,
+      "grad_norm": 0.5040662348893271,
+      "learning_rate": 2e-05,
+      "loss": 0.7816,
+      "step": 178
+    },
+    {
+      "epoch": 5.5625,
+      "eval_loss": 0.6929727792739868,
+      "eval_runtime": 49.9267,
+      "eval_samples_per_second": 4.006,
+      "eval_steps_per_second": 0.501,
+      "step": 178
+    },
+    {
+      "epoch": 5.59375,
+      "grad_norm": 0.538094993002792,
+      "learning_rate": 2e-05,
+      "loss": 0.6785,
+      "step": 179
+    },
+    {
+      "epoch": 5.59375,
+      "eval_loss": 0.6930323839187622,
+      "eval_runtime": 48.28,
+      "eval_samples_per_second": 4.143,
+      "eval_steps_per_second": 0.518,
+      "step": 179
+    },
+    {
+      "epoch": 5.625,
+      "grad_norm": 0.5367726605699668,
+      "learning_rate": 2e-05,
+      "loss": 0.6868,
+      "step": 180
+    },
+    {
+      "epoch": 5.625,
+      "eval_loss": 0.6928802728652954,
+      "eval_runtime": 49.8478,
+      "eval_samples_per_second": 4.012,
+      "eval_steps_per_second": 0.502,
+      "step": 180
+    },
+    {
+      "epoch": 5.65625,
+      "grad_norm": 0.5978542074838507,
+      "learning_rate": 2e-05,
+      "loss": 0.698,
+      "step": 181
+    },
+    {
+      "epoch": 5.65625,
+      "eval_loss": 0.6921787858009338,
+      "eval_runtime": 50.778,
+      "eval_samples_per_second": 3.939,
+      "eval_steps_per_second": 0.492,
+      "step": 181
+    },
+    {
+      "epoch": 5.6875,
+      "grad_norm": 0.5779173967988954,
+      "learning_rate": 2e-05,
+      "loss": 0.664,
+      "step": 182
+    },
+    {
+      "epoch": 5.6875,
+      "eval_loss": 0.6921034455299377,
+      "eval_runtime": 49.7171,
+      "eval_samples_per_second": 4.023,
+      "eval_steps_per_second": 0.503,
+      "step": 182
+    },
+    {
+      "epoch": 5.71875,
+      "grad_norm": 0.6377165996743129,
+      "learning_rate": 2e-05,
+      "loss": 0.7051,
+      "step": 183
+    },
+    {
+      "epoch": 5.71875,
+      "eval_loss": 0.6914942264556885,
+      "eval_runtime": 51.9608,
+      "eval_samples_per_second": 3.849,
+      "eval_steps_per_second": 0.481,
+      "step": 183
+    },
+    {
+      "epoch": 5.75,
+      "grad_norm": 0.6093388082076064,
+      "learning_rate": 2e-05,
+      "loss": 0.6903,
+      "step": 184
+    },
+    {
+      "epoch": 5.75,
+      "eval_loss": 0.6904594302177429,
+      "eval_runtime": 49.6144,
+      "eval_samples_per_second": 4.031,
+      "eval_steps_per_second": 0.504,
+      "step": 184
+    },
+    {
+      "epoch": 5.78125,
+      "grad_norm": 0.5987747297973711,
+      "learning_rate": 2e-05,
+      "loss": 0.7368,
+      "step": 185
+    },
+    {
+      "epoch": 5.78125,
+      "eval_loss": 0.6894869804382324,
+      "eval_runtime": 49.7122,
+      "eval_samples_per_second": 4.023,
+      "eval_steps_per_second": 0.503,
+      "step": 185
+    },
+    {
+      "epoch": 5.8125,
+      "grad_norm": 0.5914952733954625,
+      "learning_rate": 2e-05,
+      "loss": 0.7003,
+      "step": 186
+    },
+    {
+      "epoch": 5.8125,
+      "eval_loss": 0.6885225772857666,
+      "eval_runtime": 49.8474,
+      "eval_samples_per_second": 4.012,
+      "eval_steps_per_second": 0.502,
+      "step": 186
+    },
+    {
+      "epoch": 5.84375,
+      "grad_norm": 0.5641237505681922,
+      "learning_rate": 2e-05,
+      "loss": 0.7571,
+      "step": 187
+    },
+    {
+      "epoch": 5.84375,
+      "eval_loss": 0.6889610290527344,
+      "eval_runtime": 51.5925,
+      "eval_samples_per_second": 3.877,
+      "eval_steps_per_second": 0.485,
+      "step": 187
+    },
+    {
+      "epoch": 5.875,
+      "grad_norm": 0.5566285784572296,
+      "learning_rate": 2e-05,
+      "loss": 0.6882,
+      "step": 188
+    },
+    {
+      "epoch": 5.875,
+      "eval_loss": 0.6903389692306519,
+      "eval_runtime": 49.713,
+      "eval_samples_per_second": 4.023,
+      "eval_steps_per_second": 0.503,
+      "step": 188
+    },
+    {
+      "epoch": 5.90625,
+      "grad_norm": 0.5594562993560854,
+      "learning_rate": 2e-05,
+      "loss": 0.7028,
+      "step": 189
+    },
+    {
+      "epoch": 5.90625,
+      "eval_loss": 0.6911373734474182,
+      "eval_runtime": 49.929,
+      "eval_samples_per_second": 4.006,
+      "eval_steps_per_second": 0.501,
+      "step": 189
+    },
+    {
+      "epoch": 5.9375,
+      "grad_norm": 0.6114177699067616,
+      "learning_rate": 2e-05,
+      "loss": 0.7181,
+      "step": 190
+    },
+    {
+      "epoch": 5.9375,
+      "eval_loss": 0.6901592016220093,
+      "eval_runtime": 49.9032,
+      "eval_samples_per_second": 4.008,
+      "eval_steps_per_second": 0.501,
+      "step": 190
+    },
+    {
+      "epoch": 5.96875,
+      "grad_norm": 0.5564307101453613,
+      "learning_rate": 2e-05,
+      "loss": 0.7116,
+      "step": 191
+    },
+    {
+      "epoch": 5.96875,
+      "eval_loss": 0.6883879899978638,
+      "eval_runtime": 49.9457,
+      "eval_samples_per_second": 4.004,
+      "eval_steps_per_second": 0.501,
+      "step": 191
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.5242139835965315,
+      "learning_rate": 2e-05,
+      "loss": 0.6956,
+      "step": 192
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 0.686991274356842,
+      "eval_runtime": 51.3206,
+      "eval_samples_per_second": 3.897,
+      "eval_steps_per_second": 0.487,
+      "step": 192
+    },
+    {
+      "epoch": 6.03125,
+      "grad_norm": 0.5661038874224659,
+      "learning_rate": 2e-05,
+      "loss": 0.7667,
+      "step": 193
+    },
+    {
+      "epoch": 6.03125,
+      "eval_loss": 0.6863989233970642,
+      "eval_runtime": 50.3486,
+      "eval_samples_per_second": 3.972,
+      "eval_steps_per_second": 0.497,
+      "step": 193
+    },
+    {
+      "epoch": 6.0625,
+      "grad_norm": 0.5015705892320539,
+      "learning_rate": 2e-05,
+      "loss": 0.7289,
+      "step": 194
+    },
+    {
+      "epoch": 6.0625,
+      "eval_loss": 0.6869972348213196,
+      "eval_runtime": 51.6966,
+      "eval_samples_per_second": 3.869,
+      "eval_steps_per_second": 0.484,
+      "step": 194
+    },
+    {
+      "epoch": 6.09375,
+      "grad_norm": 0.5679476318211268,
+      "learning_rate": 2e-05,
+      "loss": 0.6595,
+      "step": 195
+    },
+    {
+      "epoch": 6.09375,
+      "eval_loss": 0.6878303289413452,
+      "eval_runtime": 44.1921,
+      "eval_samples_per_second": 4.526,
+      "eval_steps_per_second": 0.566,
+      "step": 195
+    },
+    {
+      "epoch": 6.125,
+      "grad_norm": 0.5496769650020654,
+      "learning_rate": 2e-05,
+      "loss": 0.6934,
+      "step": 196
+    },
+    {
+      "epoch": 6.125,
+      "eval_loss": 0.689085841178894,
+      "eval_runtime": 44.0432,
+      "eval_samples_per_second": 4.541,
+      "eval_steps_per_second": 0.568,
+      "step": 196
+    },
+    {
+      "epoch": 6.15625,
+      "grad_norm": 0.5761731163916711,
+      "learning_rate": 2e-05,
+      "loss": 0.7212,
+      "step": 197
+    },
+    {
+      "epoch": 6.15625,
+      "eval_loss": 0.6919547915458679,
+      "eval_runtime": 45.3631,
+      "eval_samples_per_second": 4.409,
+      "eval_steps_per_second": 0.551,
+      "step": 197
+    },
+    {
+      "epoch": 6.1875,
+      "grad_norm": 0.6093485410765964,
+      "learning_rate": 2e-05,
+      "loss": 0.8013,
+      "step": 198
+    },
+    {
+      "epoch": 6.1875,
+      "eval_loss": 0.6936098337173462,
+      "eval_runtime": 44.1956,
+      "eval_samples_per_second": 4.525,
+      "eval_steps_per_second": 0.566,
+      "step": 198
+    },
+    {
+      "epoch": 6.21875,
+      "grad_norm": 0.6670365325797192,
+      "learning_rate": 2e-05,
+      "loss": 0.666,
+      "step": 199
+    },
+    {
+      "epoch": 6.21875,
+      "eval_loss": 0.693129301071167,
+      "eval_runtime": 44.0131,
+      "eval_samples_per_second": 4.544,
+      "eval_steps_per_second": 0.568,
+      "step": 199
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.6464592274733308,
+      "learning_rate": 2e-05,
+      "loss": 0.7134,
+      "step": 200
+    },
+    {
+      "epoch": 6.25,
+      "eval_loss": 0.6912326216697693,
+      "eval_runtime": 44.0,
+      "eval_samples_per_second": 4.545,
+      "eval_steps_per_second": 0.568,
+      "step": 200
+    },
+    {
+      "epoch": 6.28125,
+      "grad_norm": 0.6088225232188101,
+      "learning_rate": 2e-05,
+      "loss": 0.7405,
+      "step": 201
+    },
+    {
+      "epoch": 6.28125,
+      "eval_loss": 0.6896650195121765,
+      "eval_runtime": 44.3194,
+      "eval_samples_per_second": 4.513,
+      "eval_steps_per_second": 0.564,
+      "step": 201
+    },
+    {
+      "epoch": 6.3125,
+      "grad_norm": 0.6638309972807995,
+      "learning_rate": 2e-05,
+      "loss": 0.6542,
+      "step": 202
+    },
+    {
+      "epoch": 6.3125,
+      "eval_loss": 0.6878445148468018,
+      "eval_runtime": 44.2101,
+      "eval_samples_per_second": 4.524,
+      "eval_steps_per_second": 0.565,
+      "step": 202
+    },
+    {
+      "epoch": 6.34375,
+      "grad_norm": 0.5632348029553863,
+      "learning_rate": 2e-05,
+      "loss": 0.7953,
+      "step": 203
+    },
+    {
+      "epoch": 6.34375,
+      "eval_loss": 0.6869116425514221,
+      "eval_runtime": 44.0039,
+      "eval_samples_per_second": 4.545,
+      "eval_steps_per_second": 0.568,
+      "step": 203
+    },
+    {
+      "epoch": 6.375,
+      "grad_norm": 0.6753158068984167,
+      "learning_rate": 2e-05,
+      "loss": 0.6369,
+      "step": 204
+    },
+    {
+      "epoch": 6.375,
+      "eval_loss": 0.6856124997138977,
+      "eval_runtime": 44.2493,
+      "eval_samples_per_second": 4.52,
+      "eval_steps_per_second": 0.565,
+      "step": 204
+    },
+    {
+      "epoch": 6.40625,
+      "grad_norm": 0.5601655147962107,
+      "learning_rate": 2e-05,
+      "loss": 0.6291,
+      "step": 205
+    },
+    {
+      "epoch": 6.40625,
+      "eval_loss": 0.685504138469696,
+      "eval_runtime": 43.9463,
+      "eval_samples_per_second": 4.551,
+      "eval_steps_per_second": 0.569,
+      "step": 205
+    },
+    {
+      "epoch": 6.4375,
+      "grad_norm": 0.6578412065562369,
+      "learning_rate": 2e-05,
+      "loss": 0.6887,
+      "step": 206
+    },
+    {
+      "epoch": 6.4375,
+      "eval_loss": 0.6858142018318176,
+      "eval_runtime": 45.1556,
+      "eval_samples_per_second": 4.429,
+      "eval_steps_per_second": 0.554,
+      "step": 206
+    },
+    {
+      "epoch": 6.46875,
+      "grad_norm": 0.6149787250576099,
+      "learning_rate": 2e-05,
+      "loss": 0.7375,
+      "step": 207
+    },
+    {
+      "epoch": 6.46875,
+      "eval_loss": 0.6860241889953613,
+      "eval_runtime": 44.9447,
+      "eval_samples_per_second": 4.45,
+      "eval_steps_per_second": 0.556,
+      "step": 207
+    },
+    {
+      "epoch": 6.5,
+      "grad_norm": 0.6674521606961297,
+      "learning_rate": 2e-05,
+      "loss": 0.6856,
+      "step": 208
+    },
+    {
+      "epoch": 6.5,
+      "eval_loss": 0.6866363286972046,
+      "eval_runtime": 44.714,
+      "eval_samples_per_second": 4.473,
+      "eval_steps_per_second": 0.559,
+      "step": 208
+    },
+    {
+      "epoch": 6.53125,
+      "grad_norm": 0.700420859386899,
+      "learning_rate": 2e-05,
+      "loss": 0.6556,
+      "step": 209
+    },
+    {
+      "epoch": 6.53125,
+      "eval_loss": 0.6870286464691162,
+      "eval_runtime": 44.8923,
+      "eval_samples_per_second": 4.455,
+      "eval_steps_per_second": 0.557,
+      "step": 209
+    },
+    {
+      "epoch": 6.5625,
+      "grad_norm": 0.6530651968630973,
+      "learning_rate": 2e-05,
+      "loss": 0.6334,
+      "step": 210
+    },
+    {
+      "epoch": 6.5625,
+      "eval_loss": 0.6872709393501282,
+      "eval_runtime": 44.7944,
+      "eval_samples_per_second": 4.465,
+      "eval_steps_per_second": 0.558,
+      "step": 210
+    },
+    {
+      "epoch": 6.59375,
+      "grad_norm": 0.695757498482456,
+      "learning_rate": 2e-05,
+      "loss": 0.6784,
+      "step": 211
+    },
+    {
+      "epoch": 6.59375,
+      "eval_loss": 0.6869171857833862,
+      "eval_runtime": 45.755,
+      "eval_samples_per_second": 4.371,
+      "eval_steps_per_second": 0.546,
+      "step": 211
+    },
+    {
+      "epoch": 6.625,
+      "grad_norm": 0.642060810781652,
+      "learning_rate": 2e-05,
+      "loss": 0.6489,
+      "step": 212
+    },
+    {
+      "epoch": 6.625,
+      "eval_loss": 0.685666024684906,
+      "eval_runtime": 46.4458,
+      "eval_samples_per_second": 4.306,
+      "eval_steps_per_second": 0.538,
+      "step": 212
+    },
+    {
+      "epoch": 6.65625,
+      "grad_norm": 0.6088750940603561,
+      "learning_rate": 2e-05,
+      "loss": 0.7216,
+      "step": 213
+    },
+    {
+      "epoch": 6.65625,
+      "eval_loss": 0.6843697428703308,
+      "eval_runtime": 46.1389,
+      "eval_samples_per_second": 4.335,
+      "eval_steps_per_second": 0.542,
+      "step": 213
+    },
+    {
+      "epoch": 6.6875,
+      "grad_norm": 0.6043945628080053,
+      "learning_rate": 2e-05,
+      "loss": 0.692,
+      "step": 214
+    },
+    {
+      "epoch": 6.6875,
+      "eval_loss": 0.6836680769920349,
+      "eval_runtime": 47.7324,
+      "eval_samples_per_second": 4.19,
+      "eval_steps_per_second": 0.524,
+      "step": 214
+    },
+    {
+      "epoch": 6.71875,
+      "grad_norm": 0.6506615838970475,
+      "learning_rate": 2e-05,
+      "loss": 0.691,
+      "step": 215
+    },
+    {
+      "epoch": 6.71875,
+      "eval_loss": 0.6824812293052673,
+      "eval_runtime": 45.8056,
+      "eval_samples_per_second": 4.366,
+      "eval_steps_per_second": 0.546,
+      "step": 215
+    },
+    {
+      "epoch": 6.75,
+      "grad_norm": 0.6878268158673746,
+      "learning_rate": 2e-05,
+      "loss": 0.6894,
+      "step": 216
+    },
+    {
+      "epoch": 6.75,
+      "eval_loss": 0.6817054748535156,
+      "eval_runtime": 46.47,
+      "eval_samples_per_second": 4.304,
+      "eval_steps_per_second": 0.538,
+      "step": 216
+    },
+    {
+      "epoch": 6.78125,
+      "grad_norm": 0.6793999118325932,
+      "learning_rate": 2e-05,
+      "loss": 0.6394,
+      "step": 217
+    },
+    {
+      "epoch": 6.78125,
+      "eval_loss": 0.6831635236740112,
+      "eval_runtime": 47.8532,
+      "eval_samples_per_second": 4.179,
+      "eval_steps_per_second": 0.522,
+      "step": 217
+    },
+    {
+      "epoch": 6.8125,
+      "grad_norm": 0.6935365262523343,
+      "learning_rate": 2e-05,
+      "loss": 0.6341,
+      "step": 218
+    },
+    {
+      "epoch": 6.8125,
+      "eval_loss": 0.6843095421791077,
+      "eval_runtime": 46.3828,
+      "eval_samples_per_second": 4.312,
+      "eval_steps_per_second": 0.539,
+      "step": 218
+    },
+    {
+      "epoch": 6.84375,
+      "grad_norm": 0.8071019513751874,
+      "learning_rate": 2e-05,
+      "loss": 0.7211,
+      "step": 219
+    },
+    {
+      "epoch": 6.84375,
+      "eval_loss": 0.6839814782142639,
+      "eval_runtime": 46.5771,
+      "eval_samples_per_second": 4.294,
+      "eval_steps_per_second": 0.537,
+      "step": 219
+    },
+    {
+      "epoch": 6.875,
+      "grad_norm": 0.7202535741704769,
+      "learning_rate": 2e-05,
+      "loss": 0.7305,
+      "step": 220
+    },
+    {
+      "epoch": 6.875,
+      "eval_loss": 0.6822354197502136,
+      "eval_runtime": 46.6149,
+      "eval_samples_per_second": 4.29,
+      "eval_steps_per_second": 0.536,
+      "step": 220
+    },
+    {
+      "epoch": 6.90625,
+      "grad_norm": 0.6829442890004696,
+      "learning_rate": 2e-05,
+      "loss": 0.6965,
+      "step": 221
+    },
+    {
+      "epoch": 6.90625,
+      "eval_loss": 0.6804749369621277,
+      "eval_runtime": 47.9027,
+      "eval_samples_per_second": 4.175,
+      "eval_steps_per_second": 0.522,
+      "step": 221
+    },
+    {
+      "epoch": 6.9375,
+      "grad_norm": 0.7007337811403486,
+      "learning_rate": 2e-05,
+      "loss": 0.6948,
+      "step": 222
+    },
+    {
+      "epoch": 6.9375,
+      "eval_loss": 0.6785742044448853,
+      "eval_runtime": 48.3484,
+      "eval_samples_per_second": 4.137,
+      "eval_steps_per_second": 0.517,
+      "step": 222
+    },
+    {
+      "epoch": 6.96875,
+      "grad_norm": 0.6672225040660534,
+      "learning_rate": 2e-05,
+      "loss": 0.7075,
+      "step": 223
+    },
+    {
+      "epoch": 6.96875,
+      "eval_loss": 0.6771878004074097,
+      "eval_runtime": 46.3836,
+      "eval_samples_per_second": 4.312,
+      "eval_steps_per_second": 0.539,
+      "step": 223
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 0.6893374424350143,
+      "learning_rate": 2e-05,
+      "loss": 0.7652,
+      "step": 224
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 0.6772673726081848,
+      "eval_runtime": 47.0913,
+      "eval_samples_per_second": 4.247,
+      "eval_steps_per_second": 0.531,
+      "step": 224
+    },
+    {
+      "epoch": 7.03125,
+      "grad_norm": 0.5866908507437849,
+      "learning_rate": 2e-05,
+      "loss": 0.6784,
+      "step": 225
+    },
+    {
+      "epoch": 7.03125,
+      "eval_loss": 0.6778077483177185,
+      "eval_runtime": 46.7766,
+      "eval_samples_per_second": 4.276,
+      "eval_steps_per_second": 0.534,
+      "step": 225
+    },
+    {
+      "epoch": 7.0625,
+      "grad_norm": 0.6620785641323407,
+      "learning_rate": 2e-05,
+      "loss": 0.6107,
+      "step": 226
+    },
+    {
+      "epoch": 7.0625,
+      "eval_loss": 0.6797336339950562,
+      "eval_runtime": 47.0779,
+      "eval_samples_per_second": 4.248,
+      "eval_steps_per_second": 0.531,
+      "step": 226
+    },
+    {
+      "epoch": 7.09375,
+      "grad_norm": 0.6646660025868149,
+      "learning_rate": 2e-05,
+      "loss": 0.6824,
+      "step": 227
+    },
+    {
+      "epoch": 7.09375,
+      "eval_loss": 0.6831703186035156,
+      "eval_runtime": 46.4223,
+      "eval_samples_per_second": 4.308,
+      "eval_steps_per_second": 0.539,
+      "step": 227
+    },
+    {
+      "epoch": 7.125,
+      "grad_norm": 0.7653429329219695,
+      "learning_rate": 2e-05,
+      "loss": 0.6289,
+      "step": 228
+    },
+    {
+      "epoch": 7.125,
+      "eval_loss": 0.6889806985855103,
+      "eval_runtime": 48.2668,
+      "eval_samples_per_second": 4.144,
+      "eval_steps_per_second": 0.518,
+      "step": 228
+    },
+    {
+      "epoch": 7.15625,
+      "grad_norm": 0.888507299589656,
+      "learning_rate": 2e-05,
+      "loss": 0.6405,
+      "step": 229
+    },
+    {
+      "epoch": 7.15625,
+      "eval_loss": 0.6938297748565674,
+      "eval_runtime": 48.2833,
+      "eval_samples_per_second": 4.142,
+      "eval_steps_per_second": 0.518,
+      "step": 229
+    },
+    {
+      "epoch": 7.1875,
+      "grad_norm": 0.8483995966585272,
+      "learning_rate": 2e-05,
+      "loss": 0.6256,
+      "step": 230
+    },
+    {
+      "epoch": 7.1875,
+      "eval_loss": 0.6941313147544861,
+      "eval_runtime": 46.6028,
+      "eval_samples_per_second": 4.292,
+      "eval_steps_per_second": 0.536,
+      "step": 230
+    },
+    {
+      "epoch": 7.21875,
+      "grad_norm": 0.8529011065789557,
+      "learning_rate": 2e-05,
+      "loss": 0.719,
+      "step": 231
+    },
+    {
+      "epoch": 7.21875,
+      "eval_loss": 0.6908813714981079,
+      "eval_runtime": 47.7668,
+      "eval_samples_per_second": 4.187,
+      "eval_steps_per_second": 0.523,
+      "step": 231
+    },
+    {
+      "epoch": 7.25,
+      "grad_norm": 0.7891947191711363,
+      "learning_rate": 2e-05,
+      "loss": 0.7122,
+      "step": 232
+    },
+    {
+      "epoch": 7.25,
+      "eval_loss": 0.6873031854629517,
+      "eval_runtime": 46.9441,
+      "eval_samples_per_second": 4.26,
+      "eval_steps_per_second": 0.533,
+      "step": 232
+    },
+    {
+      "epoch": 7.28125,
+      "grad_norm": 0.8410831266636205,
+      "learning_rate": 2e-05,
+      "loss": 0.6655,
+      "step": 233
+    },
+    {
+      "epoch": 7.28125,
+      "eval_loss": 0.6842228174209595,
+      "eval_runtime": 48.184,
+      "eval_samples_per_second": 4.151,
+      "eval_steps_per_second": 0.519,
+      "step": 233
+    },
+    {
+      "epoch": 7.3125,
+      "grad_norm": 0.7543966645145809,
+      "learning_rate": 2e-05,
+      "loss": 0.702,
+      "step": 234
+    },
+    {
+      "epoch": 7.3125,
+      "eval_loss": 0.6826092600822449,
+      "eval_runtime": 48.7587,
+      "eval_samples_per_second": 4.102,
+      "eval_steps_per_second": 0.513,
+      "step": 234
+    },
+    {
+      "epoch": 7.34375,
+      "grad_norm": 0.69863349246919,
+      "learning_rate": 2e-05,
+      "loss": 0.6676,
+      "step": 235
+    },
+    {
+      "epoch": 7.34375,
+      "eval_loss": 0.6820936799049377,
+      "eval_runtime": 46.5095,
+      "eval_samples_per_second": 4.3,
+      "eval_steps_per_second": 0.538,
+      "step": 235
+    },
+    {
+      "epoch": 7.375,
+      "grad_norm": 0.7718198795174328,
+      "learning_rate": 2e-05,
+      "loss": 0.6322,
+      "step": 236
+    },
+    {
+      "epoch": 7.375,
+      "eval_loss": 0.681590735912323,
+      "eval_runtime": 47.6491,
+      "eval_samples_per_second": 4.197,
+      "eval_steps_per_second": 0.525,
+      "step": 236
+    },
+    {
+      "epoch": 7.40625,
+      "grad_norm": 0.8032644336352275,
+      "learning_rate": 2e-05,
+      "loss": 0.6835,
+      "step": 237
+    },
+    {
+      "epoch": 7.40625,
+      "eval_loss": 0.6806458234786987,
+      "eval_runtime": 47.1412,
+      "eval_samples_per_second": 4.243,
+      "eval_steps_per_second": 0.53,
+      "step": 237
+    },
+    {
+      "epoch": 7.4375,
+      "grad_norm": 0.8165151350063435,
+      "learning_rate": 2e-05,
+      "loss": 0.6744,
+      "step": 238
+    },
+    {
+      "epoch": 7.4375,
+      "eval_loss": 0.6802331805229187,
+      "eval_runtime": 48.2476,
+      "eval_samples_per_second": 4.145,
+      "eval_steps_per_second": 0.518,
+      "step": 238
+    },
+    {
+      "epoch": 7.46875,
+      "grad_norm": 0.7665175082054141,
+      "learning_rate": 2e-05,
+      "loss": 0.6955,
+      "step": 239
+    },
+    {
+      "epoch": 7.46875,
+      "eval_loss": 0.6806652545928955,
+      "eval_runtime": 46.6541,
+      "eval_samples_per_second": 4.287,
+      "eval_steps_per_second": 0.536,
+      "step": 239
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.7584547487112137,
+      "learning_rate": 2e-05,
+      "loss": 0.6374,
+      "step": 240
+    },
+    {
+      "epoch": 7.5,
+      "eval_loss": 0.6825945973396301,
+      "eval_runtime": 46.3848,
+      "eval_samples_per_second": 4.312,
+      "eval_steps_per_second": 0.539,
+      "step": 240
+    },
+    {
+      "epoch": 7.53125,
+      "grad_norm": 0.660822695597991,
+      "learning_rate": 2e-05,
+      "loss": 0.6825,
+      "step": 241
+    },
+    {
+      "epoch": 7.53125,
+      "eval_loss": 0.6861986517906189,
+      "eval_runtime": 46.2732,
+      "eval_samples_per_second": 4.322,
+      "eval_steps_per_second": 0.54,
+      "step": 241
+    },
+    {
+      "epoch": 7.5625,
+      "grad_norm": 0.7793836425815985,
+      "learning_rate": 2e-05,
+      "loss": 0.6824,
+      "step": 242
+    },
+    {
+      "epoch": 7.5625,
+      "eval_loss": 0.6895106434822083,
+      "eval_runtime": 46.6462,
+      "eval_samples_per_second": 4.288,
+      "eval_steps_per_second": 0.536,
+      "step": 242
+    },
+    {
+      "epoch": 7.59375,
+      "grad_norm": 0.8237113294656135,
+      "learning_rate": 2e-05,
+      "loss": 0.6604,
+      "step": 243
+    },
+    {
+      "epoch": 7.59375,
+      "eval_loss": 0.6898853778839111,
+      "eval_runtime": 46.7904,
+      "eval_samples_per_second": 4.274,
+      "eval_steps_per_second": 0.534,
+      "step": 243
+    },
+    {
+      "epoch": 7.625,
+      "grad_norm": 0.9966126829271594,
+      "learning_rate": 2e-05,
+      "loss": 0.7297,
+      "step": 244
+    },
+    {
+      "epoch": 7.625,
+      "eval_loss": 0.6854925751686096,
+      "eval_runtime": 46.5541,
+      "eval_samples_per_second": 4.296,
+      "eval_steps_per_second": 0.537,
+      "step": 244
+    },
+    {
+      "epoch": 7.65625,
+      "grad_norm": 0.7581680879353856,
+      "learning_rate": 2e-05,
+      "loss": 0.6319,
+      "step": 245
+    },
+    {
+      "epoch": 7.65625,
+      "eval_loss": 0.6836807131767273,
+      "eval_runtime": 48.3404,
+      "eval_samples_per_second": 4.137,
+      "eval_steps_per_second": 0.517,
+      "step": 245
+    },
+    {
+      "epoch": 7.6875,
+      "grad_norm": 0.799947909805063,
+      "learning_rate": 2e-05,
+      "loss": 0.672,
+      "step": 246
+    },
+    {
+      "epoch": 7.6875,
+      "eval_loss": 0.681761622428894,
+      "eval_runtime": 50.0597,
+      "eval_samples_per_second": 3.995,
+      "eval_steps_per_second": 0.499,
+      "step": 246
+    },
+    {
+      "epoch": 7.71875,
+      "grad_norm": 0.8377626405796506,
+      "learning_rate": 2e-05,
+      "loss": 0.6727,
+      "step": 247
+    },
+    {
+      "epoch": 7.71875,
+      "eval_loss": 0.6791908144950867,
+      "eval_runtime": 49.25,
+      "eval_samples_per_second": 4.061,
+      "eval_steps_per_second": 0.508,
+      "step": 247
+    },
+    {
+      "epoch": 7.75,
+      "grad_norm": 0.7237789197029182,
+      "learning_rate": 2e-05,
+      "loss": 0.6576,
+      "step": 248
+    },
+    {
+      "epoch": 7.75,
+      "eval_loss": 0.6767004132270813,
+      "eval_runtime": 48.5162,
+      "eval_samples_per_second": 4.122,
+      "eval_steps_per_second": 0.515,
+      "step": 248
+    },
+    {
+      "epoch": 7.78125,
+      "grad_norm": 0.7946831722044173,
+      "learning_rate": 2e-05,
+      "loss": 0.7029,
+      "step": 249
+    },
+    {
+      "epoch": 7.78125,
+      "eval_loss": 0.675483763217926,
+      "eval_runtime": 49.9932,
+      "eval_samples_per_second": 4.001,
+      "eval_steps_per_second": 0.5,
+      "step": 249
+    },
+    {
+      "epoch": 7.8125,
+      "grad_norm": 0.7259305030593936,
+      "learning_rate": 2e-05,
+      "loss": 0.7109,
+      "step": 250
+    },
+    {
+      "epoch": 7.8125,
+      "eval_loss": 0.6768932938575745,
+      "eval_runtime": 49.852,
+      "eval_samples_per_second": 4.012,
+      "eval_steps_per_second": 0.501,
+      "step": 250
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 256,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 5,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 323774769004544.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

llava-v1.6-vicuna-7b/checkpoint-250/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ea8611394d63bbf39faf873e11963832cd01e9cc120193562f724dc8f437c23b
+size 8248

llava-v1.6-vicuna-7b/checkpoint-250/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,604 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It
+# gets copied into the top-level checkpoint dir, so the user can easily do the conversion at any point
+# in the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# set to a truthy value to make the reconstruction helpers print extra diagnostics
+debug = 0
+# load to cpu
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters from the per-rank fp32 partitions and store them in ``state_dict``.
+    For every param group the partitions of all ranks are concatenated into one flat vector, which is
+    then sliced parameter by parameter using the shapes recorded in the first model state.
+    Args:
+        - ``state_dict``: mapping updated in place with ``name -> tensor`` entries
+        - ``world_size``: number of ranks; used for the debug dump and the alignment check
+        - ``fp32_flat_groups``: per rank, a sequence of flat fp32 tensors (one per param group)
+        - ``zero_model_states``: parsed model states; only element 0 is read
+    Raises:
+        - ``ValueError``: after alignment, the consumed element count differs from the group size
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # gather partition i from every rank, in rank order, and join them into one vector
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    # total over all groups; only reported by the debug output below
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset walks through this group's merged vector; avail_numel is now per group
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shape may expose .numel() or be a plain sequence of ints
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # slice this param's elements out of the flat vector and give them the param's shape
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        # round x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    # Rebuilds every trainable parameter of a ZeRO-3 checkpoint from the per-rank flat fp32 tensors
+    # and stores the result in ``state_dict`` (updated in place).
+    # Args:
+    #   - state_dict: mapping that receives the reconstructed tensors
+    #   - world_size: number of ranks; fp32_flat_groups is indexed 0..world_size-1
+    #   - fp32_flat_groups: one flat tensor per rank
+    #   - zero_model_states: per-rank model states; only index 0's param_shapes is read
+    # Raises:
+    #   - ValueError if the consumed element count does not match the available element count
+    param_shapes = zero_model_states[0].param_shapes
+    # total elements available, assuming every rank's flat tensor has the same length as rank 0's
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        # (same value as the avail_numel computed above; recomputed here for the debug report)
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    # offset is the read position inside each rank's flat tensor; it is the same for every rank
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        # per-rank slice length (ceiling of numel / world_size); the padding count is only reported in debug output
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        # take the slice [offset, offset + partitioned_numel) from every rank, concatenate the slices
+        # in rank order, keep the first unpartitioned_numel elements and view them with the full shape
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    # offset counted elements of a single rank; scale it to all ranks for the comparison below
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_file,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

llava-v1.6-vicuna-7b/checkpoint-320/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: liuhaotian/llava-v1.6-vicuna-7b
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

llava-v1.6-vicuna-7b/checkpoint-320/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "liuhaotian/llava-v1.6-vicuna-7b",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "gate_proj",
+    "k_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

llava-v1.6-vicuna-7b/checkpoint-320/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4313b2ea485203d6fba1b8b80e837fc73325327f1f1f16fd39522be0b1da358c
+size 42421336

llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b660c176ff036e1ecc58bfcfeae750716f314fbdef0ec57ac00a32dabc17f256
+size 663858

llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3eee39e8847c82b78f31b57b8d84784bbacd842f2b4d1858e74d6d91b3c333c7
+size 126447597

llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7bf3add73552d5025f0487dd677774f926967ca02fc108fbef5c2473dc1ef74
+size 663858

llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8008a5a3f6fa99449d364364055d849bc255f8fe9025193a109095b3a70d6a7
+size 126447597

llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7a72445f646f627cc5cbfe9d51d86e7d78a8be9e42bde9660900382e9888907
+size 663858

llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3d007cd1e6b6a49fcec0bed80ffe4cddf535ecaa9473f9bf211e6c79b931e119
+size 126447597

llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f66849ad6c27fe564bd58ec87848fd0164e25403ca9da48d69fdcc9798ef396f
+size 663858

llava-v1.6-vicuna-7b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a93a75be323db17c5e5ff3951975403ebb597fdf2a9e235f408c197203e4dbd8
+size 126447597

llava-v1.6-vicuna-7b/checkpoint-320/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step320

llava-v1.6-vicuna-7b/checkpoint-320/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cad37dced9658289185bb66d80e38cc3d7678aa17da9cb91f735b159161af7e
+size 14960

llava-v1.6-vicuna-7b/checkpoint-320/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c415fda9f147269c953b761d51e95cffbd238d1fca03960a4e9750a33b27c8f7
+size 14960

llava-v1.6-vicuna-7b/checkpoint-320/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd1022d43464067191ecca4286fd1d603c7c279203f15e4a7d4c5f31930e3675
+size 14960

llava-v1.6-vicuna-7b/checkpoint-320/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6996376d0f47446f767340c6749d456b3ebbe770f49ec176c9f7441e988720c
+size 14960

llava-v1.6-vicuna-7b/checkpoint-320/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

llava-v1.6-vicuna-7b/checkpoint-320/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

llava-v1.6-vicuna-7b/checkpoint-320/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 2048,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

llava-v1.6-vicuna-7b/checkpoint-320/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

llava-v1.6-vicuna-7b/checkpoint-320/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6eea5b87d44dd458ed50657a8bb3e49c499ba27787b87233c867be9d754a4078
+size 8248

llava-v1.6-vicuna-7b/checkpoint-320/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,604 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# verbosity switch tested by the ``if debug:`` branches in this file; any truthy value enables the
+# extra prints (the command line entry point overwrites it from the -d/--debug option)
+debug = 0
+# load to cpu
+# (used as ``map_location`` for every ``torch.load`` call in this file)
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    # Loads every model states file in ``files`` and extracts the parts needed for fp32
+    # reconstruction.
+    # Args:
+    #   - files: paths of the model states files to load
+    # Returns:
+    #   - list with one ``zero_model_state`` per input file, in the order of ``files``
+    # Raises:
+    #   - ValueError if a loaded file has no BUFFER_NAMES entry
+    zero_model_states = []
+    for file in files:
+        # NOTE(review): torch.load may unpickle arbitrary objects - only use on trusted checkpoints
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        # NOTE(review): param_names is built here and extended below but never read again in this
+        # function - looks like a leftover, confirm before removing
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        # (.get() with a None default: the entry may be absent from the file)
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        # (stored as [key, value] pairs of the "shared_params" mapping)
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_file,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

llava-v1.6-vicuna-7b/config.json ADDED Viewed

	@@ -0,0 +1,76 @@

+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "liuhaotian/llava-v1.6-vicuna-7b",
+  "architectures": [
+    "LlavaLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "freeze_mm_mlp_adapter": false,
+  "freeze_mm_vision_resampler": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "image_crop_resolution": 224,
+  "image_grid_pinpoints": [
+    [
+      336,
+      672
+    ],
+    [
+      672,
+      336
+    ],
+    [
+      672,
+      672
+    ],
+    [
+      1008,
+      336
+    ],
+    [
+      336,
+      1008
+    ]
+  ],
+  "image_split_resolution": 224,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 4096,
+  "mlp_bias": false,
+  "mm_hidden_size": 1024,
+  "mm_patch_merge_type": "flat",
+  "mm_projector_lr": 2e-05,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_resampler_type": null,
+  "mm_use_im_patch_token": false,
+  "mm_use_im_start_end": false,
+  "mm_vision_select_feature": "patch",
+  "mm_vision_select_layer": -2,
+  "mm_vision_tower": "openai/clip-vit-large-patch14-336",
+  "mm_vision_tower_lr": 2e-06,
+  "model_type": "llava_llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "pad_token_id": 0,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.0",
+  "tune_mm_mlp_adapter": false,
+  "tune_mm_vision_resampler": false,
+  "unfreeze_mm_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "vocab_size": 32000
+}

llava-v1.6-vicuna-7b/non_lora_trainables.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f91e7376b38b4898ee030d0d2d4cc3f0bc12b54dd1a5ea96ec8a81fbd93ede09
+size 41961648

llava-v1.6-vicuna-7b/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1e26f3db90c35ad4d4a612238fe53fb32f8c1bebaaa3c1e2dff52289e6f653d
+size 126446114