codewizardUV committed on
Commit 8a04510
1 Parent(s): 6ccfd60

Upload 11 files
README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: mistralai/Mistral-7B-v0.1
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.11.1
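The model card above leaves its quick-start snippet as [More Information Needed]. A minimal sketch of how a PEFT LoRA adapter like this one is typically loaded on top of its `base_model` is shown below; the adapter repo id in the usage comment is a placeholder (this commit does not name the hosting repository), and `transformers`/`peft` must be installed.

```python
def load_sustainability_model(adapter_repo: str,
                              base_model: str = "mistralai/Mistral-7B-v0.1"):
    """Load the Mistral-7B base weights and apply this LoRA adapter on top.

    Imports are kept inside the function so the sketch can be read
    without transformers/peft installed.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    base = AutoModelForCausalLM.from_pretrained(
        base_model, torch_dtype="auto", device_map="auto"
    )
    # Apply the adapter weights uploaded in this commit on top of the frozen base.
    model = PeftModel.from_pretrained(base, adapter_repo)
    model.eval()
    return model, tokenizer

# Usage (placeholder repo id -- substitute wherever these files are hosted):
#   model, tokenizer = load_sustainability_model("your-username/your-adapter-repo")
```

`PeftModel.from_pretrained` reads `adapter_config.json` and `adapter_model.safetensors` from the adapter repo, which are exactly the files added in this commit.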
adapter_config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 16,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "k_proj",
+     "up_proj",
+     "o_proj",
+     "gate_proj",
+     "down_proj",
+     "lm_head",
+     "v_proj",
+     "q_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
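The adapter config above describes a rank-16 LoRA with alpha 32 (an effective update scale of alpha/r = 2.0) applied to every attention and MLP projection plus the LM head. A small sketch restating the key fields, with an assumed helper for recreating the config via `peft.LoraConfig`:

```python
# Key fields from adapter_config.json above, restated as a plain dict.
LORA_CONFIG = {
    "r": 16,            # LoRA rank: each adapted weight gets a rank-16 update
    "lora_alpha": 32,   # scaling numerator; effective scale is alpha / r
    "lora_dropout": 0.05,
    "task_type": "CAUSAL_LM",
    "target_modules": [  # attention + MLP projections, plus the LM head
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj", "lm_head",
    ],
}

def lora_scaling(cfg: dict) -> float:
    """LoRA applies (alpha / r) * B @ A on top of each frozen target weight."""
    return cfg["lora_alpha"] / cfg["r"]

def to_peft_config(cfg: dict):
    """Recreate the adapter config as a peft LoraConfig (requires `peft`)."""
    from peft import LoraConfig
    return LoraConfig(
        r=cfg["r"],
        lora_alpha=cfg["lora_alpha"],
        lora_dropout=cfg["lora_dropout"],
        bias="none",
        task_type=cfg["task_type"],
        target_modules=cfg["target_modules"],
    )
```

Including `lm_head` in `target_modules` is a notable choice here: most LoRA recipes adapt only the attention/MLP projections, so the output head is being fine-tuned as well.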
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7eafcbc0089aed2f7e768fcdfb94e5cad7c85d830deda9edc95e65c910a3bb6e
+ size 694431312
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a939a1db32129f5fdcd5d4b0297448b7b0c7893c051ea3ba917d15999b155a1d
+ size 340434810
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ffbbf1b6cf3dfe593d4df98867b126284b77f8a7fd6b324b295df656d3aa0125
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84b0665c36ae581687c7b4e8d9d2bde38129814b2025110f7047eb4135a81ab1
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "</s>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [],
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
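Note that the tokenizer config reuses `</s>` (EOS) as the padding token, since Mistral's base tokenizer ships without a dedicated pad token. A hedged sketch of reproducing that setup when loading the tokenizer fresh (requires `transformers`):

```python
def load_tokenizer(repo: str = "mistralai/Mistral-7B-v0.1"):
    """Tokenizer setup matching tokenizer_config.json above:
    LlamaTokenizer, BOS prepended, no EOS appended, EOS reused as padding."""
    from transformers import AutoTokenizer
    tok = AutoTokenizer.from_pretrained(repo)
    tok.pad_token = tok.eos_token  # "</s>" doubles as the pad token, as configured
    return tok
```

This matters mainly when batching: without a pad token set, padded fine-tuning or batched generation with this tokenizer raises an error.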
trainer_state.json ADDED
@@ -0,0 +1,150 @@
+ {
+   "best_metric": 1.6187845468521118,
+   "best_model_checkpoint": "./Sustainability_model/checkpoint-2000",
+   "epoch": 1.220703125,
+   "eval_steps": 100,
+   "global_step": 2500,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.01220703125, "grad_norm": 3.0088555812835693, "learning_rate": 2e-05, "loss": 2.1582, "step": 25},
+     {"epoch": 0.0244140625, "grad_norm": 5.197660446166992, "learning_rate": 2e-05, "loss": 2.0856, "step": 50},
+     {"epoch": 0.03662109375, "grad_norm": 3.234564781188965, "learning_rate": 2e-05, "loss": 1.9269, "step": 75},
+     {"epoch": 0.048828125, "grad_norm": 7.08390474319458, "learning_rate": 2e-05, "loss": 1.888, "step": 100},
+     {"epoch": 0.048828125, "eval_loss": 1.8261231184005737, "eval_runtime": 590.9102, "eval_samples_per_second": 3.468, "eval_steps_per_second": 0.435, "step": 100},
+     {"epoch": 0.06103515625, "grad_norm": 3.1646361351013184, "learning_rate": 2e-05, "loss": 1.8649, "step": 125},
+     {"epoch": 0.0732421875, "grad_norm": 6.104555130004883, "learning_rate": 2e-05, "loss": 1.742, "step": 150},
+     {"epoch": 0.08544921875, "grad_norm": 2.9724113941192627, "learning_rate": 2e-05, "loss": 1.7567, "step": 175},
+     {"epoch": 0.09765625, "grad_norm": 6.2468791007995605, "learning_rate": 2e-05, "loss": 1.7452, "step": 200},
+     {"epoch": 0.09765625, "eval_loss": 1.7315690517425537, "eval_runtime": 590.974, "eval_samples_per_second": 3.467, "eval_steps_per_second": 0.435, "step": 200},
+     {"epoch": 0.10986328125, "grad_norm": 2.97963285446167, "learning_rate": 2e-05, "loss": 1.6694, "step": 225},
+     {"epoch": 0.1220703125, "grad_norm": 4.771264553070068, "learning_rate": 2e-05, "loss": 1.6833, "step": 250},
+     {"epoch": 0.13427734375, "grad_norm": 2.825491428375244, "learning_rate": 2e-05, "loss": 1.6958, "step": 275},
+     {"epoch": 0.146484375, "grad_norm": 4.647068977355957, "learning_rate": 2e-05, "loss": 1.7428, "step": 300},
+     {"epoch": 0.146484375, "eval_loss": 1.6999598741531372, "eval_runtime": 590.2857, "eval_samples_per_second": 3.471, "eval_steps_per_second": 0.435, "step": 300},
+     {"epoch": 0.15869140625, "grad_norm": 3.1953535079956055, "learning_rate": 2e-05, "loss": 1.7458, "step": 325},
+     {"epoch": 0.1708984375, "grad_norm": 5.5873799324035645, "learning_rate": 2e-05, "loss": 1.6244, "step": 350},
+     {"epoch": 0.18310546875, "grad_norm": 2.5425360202789307, "learning_rate": 2e-05, "loss": 1.6862, "step": 375},
+     {"epoch": 0.1953125, "grad_norm": 4.082971572875977, "learning_rate": 2e-05, "loss": 1.6836, "step": 400},
+     {"epoch": 0.1953125, "eval_loss": 1.6864606142044067, "eval_runtime": 589.1989, "eval_samples_per_second": 3.478, "eval_steps_per_second": 0.436, "step": 400},
+     {"epoch": 0.20751953125, "grad_norm": 2.6709253787994385, "learning_rate": 2e-05, "loss": 1.6939, "step": 425},
+     {"epoch": 0.2197265625, "grad_norm": 5.410455703735352, "learning_rate": 2e-05, "loss": 1.5974, "step": 450},
+     {"epoch": 0.23193359375, "grad_norm": 2.8631389141082764, "learning_rate": 2e-05, "loss": 1.6609, "step": 475},
+     {"epoch": 0.244140625, "grad_norm": 3.2581229209899902, "learning_rate": 2e-05, "loss": 1.6251, "step": 500},
+     {"epoch": 0.244140625, "eval_loss": 1.67488431930542, "eval_runtime": 589.2638, "eval_samples_per_second": 3.477, "eval_steps_per_second": 0.436, "step": 500},
+     {"epoch": 0.25634765625, "grad_norm": 2.8811697959899902, "learning_rate": 2e-05, "loss": 1.7135, "step": 525},
+     {"epoch": 0.2685546875, "grad_norm": 5.96162748336792, "learning_rate": 2e-05, "loss": 1.6709, "step": 550},
+     {"epoch": 0.28076171875, "grad_norm": 2.4651806354522705, "learning_rate": 2e-05, "loss": 1.6504, "step": 575},
+     {"epoch": 0.29296875, "grad_norm": 4.032615661621094, "learning_rate": 2e-05, "loss": 1.7128, "step": 600},
+     {"epoch": 0.29296875, "eval_loss": 1.668798565864563, "eval_runtime": 589.1105, "eval_samples_per_second": 3.478, "eval_steps_per_second": 0.436, "step": 600},
+     {"epoch": 0.30517578125, "grad_norm": 2.694554328918457, "learning_rate": 2e-05, "loss": 1.7093, "step": 625},
+     {"epoch": 0.3173828125, "grad_norm": 4.213258743286133, "learning_rate": 2e-05, "loss": 1.6899, "step": 650},
+     {"epoch": 0.32958984375, "grad_norm": 2.69679594039917, "learning_rate": 2e-05, "loss": 1.6451, "step": 675},
+     {"epoch": 0.341796875, "grad_norm": 3.6988604068756104, "learning_rate": 2e-05, "loss": 1.631, "step": 700},
+     {"epoch": 0.341796875, "eval_loss": 1.662984013557434, "eval_runtime": 588.5535, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.437, "step": 700},
+     {"epoch": 0.35400390625, "grad_norm": 2.6815237998962402, "learning_rate": 2e-05, "loss": 1.688, "step": 725},
+     {"epoch": 0.3662109375, "grad_norm": 5.819088459014893, "learning_rate": 2e-05, "loss": 1.6649, "step": 750},
+     {"epoch": 0.37841796875, "grad_norm": 2.524092674255371, "learning_rate": 2e-05, "loss": 1.6305, "step": 775},
+     {"epoch": 0.390625, "grad_norm": 4.0569963455200195, "learning_rate": 2e-05, "loss": 1.6493, "step": 800},
+     {"epoch": 0.390625, "eval_loss": 1.6568603515625, "eval_runtime": 588.2081, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.437, "step": 800},
+     {"epoch": 0.40283203125, "grad_norm": 2.565763473510742, "learning_rate": 2e-05, "loss": 1.6983, "step": 825},
+     {"epoch": 0.4150390625, "grad_norm": 6.5800676345825195, "learning_rate": 2e-05, "loss": 1.6565, "step": 850},
+     {"epoch": 0.42724609375, "grad_norm": 2.1741669178009033, "learning_rate": 2e-05, "loss": 1.7585, "step": 875},
+     {"epoch": 0.439453125, "grad_norm": 3.838252305984497, "learning_rate": 2e-05, "loss": 1.6141, "step": 900},
+     {"epoch": 0.439453125, "eval_loss": 1.6529587507247925, "eval_runtime": 588.0827, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.437, "step": 900},
+     {"epoch": 0.45166015625, "grad_norm": 4.486364841461182, "learning_rate": 2e-05, "loss": 1.6489, "step": 925},
+     {"epoch": 0.4638671875, "grad_norm": 3.693453311920166, "learning_rate": 2e-05, "loss": 1.6026, "step": 950},
+     {"epoch": 0.47607421875, "grad_norm": 2.4286513328552246, "learning_rate": 2e-05, "loss": 1.5639, "step": 975},
+     {"epoch": 0.48828125, "grad_norm": 3.9820656776428223, "learning_rate": 2e-05, "loss": 1.6621, "step": 1000},
+     {"epoch": 0.48828125, "eval_loss": 1.6506658792495728, "eval_runtime": 588.1468, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.437, "step": 1000},
+     {"epoch": 0.50048828125, "grad_norm": 2.915191411972046, "learning_rate": 2e-05, "loss": 1.6281, "step": 1025},
+     {"epoch": 0.5126953125, "grad_norm": 4.406491756439209, "learning_rate": 2e-05, "loss": 1.7108, "step": 1050},
+     {"epoch": 0.52490234375, "grad_norm": 2.6505398750305176, "learning_rate": 2e-05, "loss": 1.7151, "step": 1075},
+     {"epoch": 0.537109375, "grad_norm": 3.872833728790283, "learning_rate": 2e-05, "loss": 1.5925, "step": 1100},
+     {"epoch": 0.537109375, "eval_loss": 1.6442919969558716, "eval_runtime": 588.2624, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.437, "step": 1100},
+     {"epoch": 0.54931640625, "grad_norm": 2.210282802581787, "learning_rate": 2e-05, "loss": 1.5845, "step": 1125},
+     {"epoch": 0.5615234375, "grad_norm": 3.7344298362731934, "learning_rate": 2e-05, "loss": 1.5994, "step": 1150},
+     {"epoch": 0.57373046875, "grad_norm": 2.3247945308685303, "learning_rate": 2e-05, "loss": 1.622, "step": 1175},
+     {"epoch": 0.5859375, "grad_norm": 4.974765300750732, "learning_rate": 2e-05, "loss": 1.6571, "step": 1200},
+     {"epoch": 0.5859375, "eval_loss": 1.6453276872634888, "eval_runtime": 588.5916, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.437, "step": 1200},
+     {"epoch": 0.59814453125, "grad_norm": 2.6029038429260254, "learning_rate": 2e-05, "loss": 1.6854, "step": 1225},
+     {"epoch": 0.6103515625, "grad_norm": 3.8252599239349365, "learning_rate": 2e-05, "loss": 1.6875, "step": 1250},
+     {"epoch": 0.62255859375, "grad_norm": 2.5335938930511475, "learning_rate": 2e-05, "loss": 1.5917, "step": 1275},
+     {"epoch": 0.634765625, "grad_norm": 3.6627395153045654, "learning_rate": 2e-05, "loss": 1.6078, "step": 1300},
+     {"epoch": 0.634765625, "eval_loss": 1.638580322265625, "eval_runtime": 588.7972, "eval_samples_per_second": 3.48, "eval_steps_per_second": 0.436, "step": 1300},
+     {"epoch": 0.64697265625, "grad_norm": 2.5015482902526855, "learning_rate": 2e-05, "loss": 1.6793, "step": 1325},
+     {"epoch": 0.6591796875, "grad_norm": 3.70072340965271, "learning_rate": 2e-05, "loss": 1.661, "step": 1350},
+     {"epoch": 0.67138671875, "grad_norm": 2.6039609909057617, "learning_rate": 2e-05, "loss": 1.6349, "step": 1375},
+     {"epoch": 0.68359375, "grad_norm": 3.3291618824005127, "learning_rate": 2e-05, "loss": 1.616, "step": 1400},
+     {"epoch": 0.68359375, "eval_loss": 1.6347644329071045, "eval_runtime": 588.5837, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.437, "step": 1400},
+     {"epoch": 0.69580078125, "grad_norm": 2.6853315830230713, "learning_rate": 2e-05, "loss": 1.7087, "step": 1425},
+     {"epoch": 0.7080078125, "grad_norm": 3.296851396560669, "learning_rate": 2e-05, "loss": 1.6676, "step": 1450},
+     {"epoch": 0.72021484375, "grad_norm": 2.3841185569763184, "learning_rate": 2e-05, "loss": 1.6212, "step": 1475},
+     {"epoch": 0.732421875, "grad_norm": 3.612088441848755, "learning_rate": 2e-05, "loss": 1.6473, "step": 1500},
+     {"epoch": 0.732421875, "eval_loss": 1.6339186429977417, "eval_runtime": 588.3073, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.437, "step": 1500},
+     {"epoch": 0.74462890625, "grad_norm": 2.6555330753326416, "learning_rate": 2e-05, "loss": 1.6643, "step": 1525},
+     {"epoch": 0.7568359375, "grad_norm": 4.533504486083984, "learning_rate": 2e-05, "loss": 1.6236, "step": 1550},
+     {"epoch": 0.76904296875, "grad_norm": 2.2276220321655273, "learning_rate": 2e-05, "loss": 1.6783, "step": 1575},
+     {"epoch": 0.78125, "grad_norm": 3.533113956451416, "learning_rate": 2e-05, "loss": 1.6123, "step": 1600},
+     {"epoch": 0.78125, "eval_loss": 1.628023386001587, "eval_runtime": 588.6386, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.437, "step": 1600},
+     {"epoch": 0.79345703125, "grad_norm": 2.2332117557525635, "learning_rate": 2e-05, "loss": 1.6795, "step": 1625},
+     {"epoch": 0.8056640625, "grad_norm": 4.059207916259766, "learning_rate": 2e-05, "loss": 1.5915, "step": 1650},
+     {"epoch": 0.81787109375, "grad_norm": 2.46692156791687, "learning_rate": 2e-05, "loss": 1.6456, "step": 1675},
+     {"epoch": 0.830078125, "grad_norm": 3.602611780166626, "learning_rate": 2e-05, "loss": 1.564, "step": 1700},
+     {"epoch": 0.830078125, "eval_loss": 1.6274890899658203, "eval_runtime": 588.2617, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.437, "step": 1700},
+     {"epoch": 0.84228515625, "grad_norm": 2.20896315574646, "learning_rate": 2e-05, "loss": 1.6469, "step": 1725},
+     {"epoch": 0.8544921875, "grad_norm": 4.329638481140137, "learning_rate": 2e-05, "loss": 1.5571, "step": 1750},
+     {"epoch": 0.86669921875, "grad_norm": 1.9945570230484009, "learning_rate": 2e-05, "loss": 1.6461, "step": 1775},
+     {"epoch": 0.87890625, "grad_norm": 3.428687334060669, "learning_rate": 2e-05, "loss": 1.6564, "step": 1800},
+     {"epoch": 0.87890625, "eval_loss": 1.6232744455337524, "eval_runtime": 588.0784, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.437, "step": 1800},
+     {"epoch": 0.89111328125, "grad_norm": 2.5266592502593994, "learning_rate": 2e-05, "loss": 1.5607, "step": 1825},
+     {"epoch": 0.9033203125, "grad_norm": 3.4067883491516113, "learning_rate": 2e-05, "loss": 1.6394, "step": 1850},
+     {"epoch": 0.91552734375, "grad_norm": 2.0028152465820312, "learning_rate": 2e-05, "loss": 1.6908, "step": 1875},
+     {"epoch": 0.927734375, "grad_norm": 2.8983733654022217, "learning_rate": 2e-05, "loss": 1.5646, "step": 1900},
+     {"epoch": 0.927734375, "eval_loss": 1.6202832460403442, "eval_runtime": 587.8115, "eval_samples_per_second": 3.486, "eval_steps_per_second": 0.437, "step": 1900},
+     {"epoch": 0.93994140625, "grad_norm": 2.6408419609069824, "learning_rate": 2e-05, "loss": 1.5905, "step": 1925},
+     {"epoch": 0.9521484375, "grad_norm": 3.899275302886963, "learning_rate": 2e-05, "loss": 1.6138, "step": 1950},
+     {"epoch": 0.96435546875, "grad_norm": 2.338137149810791, "learning_rate": 2e-05, "loss": 1.6963, "step": 1975},
+     {"epoch": 0.9765625, "grad_norm": 3.6352951526641846, "learning_rate": 2e-05, "loss": 1.5849, "step": 2000},
+     {"epoch": 0.9765625, "eval_loss": 1.6187845468521118, "eval_runtime": 587.8791, "eval_samples_per_second": 3.485, "eval_steps_per_second": 0.437, "step": 2000},
+     {"epoch": 0.98876953125, "grad_norm": 2.4254846572875977, "learning_rate": 2e-05, "loss": 1.6391, "step": 2025},
+     {"epoch": 1.0009765625, "grad_norm": 2.079317569732666, "learning_rate": 2e-05, "loss": 1.6238, "step": 2050},
+     {"epoch": 1.01318359375, "grad_norm": 2.1677002906799316, "learning_rate": 2e-05, "loss": 1.5543, "step": 2075},
+     {"epoch": 1.025390625, "grad_norm": 2.4266505241394043, "learning_rate": 2e-05, "loss": 1.4812, "step": 2100},
+     {"epoch": 1.025390625, "eval_loss": 1.6256210803985596, "eval_runtime": 585.954, "eval_samples_per_second": 3.497, "eval_steps_per_second": 0.439, "step": 2100},
+     {"epoch": 1.03759765625, "grad_norm": 2.4697976112365723, "learning_rate": 2e-05, "loss": 1.5147, "step": 2125},
+     {"epoch": 1.0498046875, "grad_norm": 2.3185527324676514, "learning_rate": 2e-05, "loss": 1.5198, "step": 2150},
+     {"epoch": 1.06201171875, "grad_norm": 2.7304463386535645, "learning_rate": 2e-05, "loss": 1.5237, "step": 2175},
+     {"epoch": 1.07421875, "grad_norm": 2.616072177886963, "learning_rate": 2e-05, "loss": 1.5598, "step": 2200},
+     {"epoch": 1.07421875, "eval_loss": 1.623382568359375, "eval_runtime": 586.1381, "eval_samples_per_second": 3.496, "eval_steps_per_second": 0.438, "step": 2200},
+     {"epoch": 1.08642578125, "grad_norm": 2.7308809757232666, "learning_rate": 2e-05, "loss": 1.5691, "step": 2225},
+     {"epoch": 1.0986328125, "grad_norm": 2.6916451454162598, "learning_rate": 2e-05, "loss": 1.5102, "step": 2250},
+     {"epoch": 1.11083984375, "grad_norm": 2.960580348968506, "learning_rate": 2e-05, "loss": 1.539, "step": 2275},
+     {"epoch": 1.123046875, "grad_norm": 2.5936009883880615, "learning_rate": 2e-05, "loss": 1.5657, "step": 2300},
+     {"epoch": 1.123046875, "eval_loss": 1.6226788759231567, "eval_runtime": 586.4284, "eval_samples_per_second": 3.494, "eval_steps_per_second": 0.438, "step": 2300},
+     {"epoch": 1.13525390625, "grad_norm": 2.8930952548980713, "learning_rate": 2e-05, "loss": 1.4579, "step": 2325},
+     {"epoch": 1.1474609375, "grad_norm": 2.8736538887023926, "learning_rate": 2e-05, "loss": 1.5127, "step": 2350},
+     {"epoch": 1.15966796875, "grad_norm": 4.384296894073486, "learning_rate": 2e-05, "loss": 1.5988, "step": 2375},
+     {"epoch": 1.171875, "grad_norm": 2.728992223739624, "learning_rate": 2e-05, "loss": 1.51, "step": 2400},
+     {"epoch": 1.171875, "eval_loss": 1.6226541996002197, "eval_runtime": 586.345, "eval_samples_per_second": 3.495, "eval_steps_per_second": 0.438, "step": 2400},
+     {"epoch": 1.18408203125, "grad_norm": 2.651820421218872, "learning_rate": 2e-05, "loss": 1.5226, "step": 2425},
+     {"epoch": 1.1962890625, "grad_norm": 2.717193126678467, "learning_rate": 2e-05, "loss": 1.4966, "step": 2450},
+     {"epoch": 1.20849609375, "grad_norm": 2.9759628772735596, "learning_rate": 2e-05, "loss": 1.526, "step": 2475},
+     {"epoch": 1.220703125, "grad_norm": 2.8832080364227295, "learning_rate": 2e-05, "loss": 1.5452, "step": 2500},
+     {"epoch": 1.220703125, "eval_loss": 1.6226392984390259, "eval_runtime": 586.3744, "eval_samples_per_second": 3.494, "eval_steps_per_second": 0.438, "step": 2500}
+   ],
+   "logging_steps": 25,
+   "max_steps": 4096,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 2,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "EarlyStoppingCallback": {"args": {"early_stopping_patience": 6, "early_stopping_threshold": 0.0}, "attributes": {"early_stopping_patience_counter": 0}},
+     "TrainerControl": {"args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false}, "attributes": {}}
+   },
+   "total_flos": 8.924062136972083e+16,
+   "train_batch_size": 4,
+   "trial_name": null,
+   "trial_params": null
+ }
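The trainer state records its best eval loss (1.6187845468521118) at step 2000, matching `best_model_checkpoint: ./Sustainability_model/checkpoint-2000`. A small stdlib-only sketch for pulling the eval-loss curve and the best checkpoint step out of a `trainer_state.json` like the one above:

```python
import json


def eval_curve(path: str = "trainer_state.json"):
    """Extract (step, eval_loss) pairs from log_history, plus the best step.

    Eval records are the log_history entries that carry an "eval_loss" key;
    training-loss records carry "loss" instead and are skipped.
    """
    with open(path) as f:
        state = json.load(f)
    evals = [(rec["step"], rec["eval_loss"])
             for rec in state["log_history"] if "eval_loss" in rec]
    best_step, best_loss = min(evals, key=lambda rec: rec[1])
    return evals, best_step, best_loss
```

Run against the file above, this should reproduce `best_step == 2000` and `best_loss == 1.6187845468521118`; eval loss stagnates after that point, which is what the `EarlyStoppingCallback` (patience 6) is watching.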
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de0b2104317241ff613fb434c0319adc80836c4f3c6e7859e72a0d7bb2a1c248
+ size 5368