Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

README.md +202 -0
adapter_config.json +31 -0
adapter_model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +24 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +43 -0
trainer_state.json +1043 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: mistralai/Mistral-7B-Instruct-v0.2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** mistralai/Mistral-7B-Instruct-v0.2
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ed929dd92c617acd4d7992e55d29830f13b015cfad300b5c34ec43450c2f845f
+size 54560368

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17bf479721d47d280bd41043527056b2c87382d387b877657f7f558192cc5efc
+size 109203770

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da36ab7162edaa976753eef071be28bcc74efa3bc88b528f89dfdc1207754a2a
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38086b6a270732e89612151792ce22f14e0bd34937f42dfe8a8017e39426ee29
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1043 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 8.0,
+  "eval_steps": 500,
+  "global_step": 146,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0547945205479452,
+      "grad_norm": 1.6586511135101318,
+      "learning_rate": 1e-06,
+      "loss": 2.2379,
+      "step": 1
+    },
+    {
+      "epoch": 0.1095890410958904,
+      "grad_norm": 1.6110830307006836,
+      "learning_rate": 1e-06,
+      "loss": 2.2933,
+      "step": 2
+    },
+    {
+      "epoch": 0.1643835616438356,
+      "grad_norm": 1.5261093378067017,
+      "learning_rate": 1e-06,
+      "loss": 2.2564,
+      "step": 3
+    },
+    {
+      "epoch": 0.2191780821917808,
+      "grad_norm": 1.6366506814956665,
+      "learning_rate": 1e-06,
+      "loss": 2.2794,
+      "step": 4
+    },
+    {
+      "epoch": 0.273972602739726,
+      "grad_norm": 1.5530800819396973,
+      "learning_rate": 1e-06,
+      "loss": 2.2344,
+      "step": 5
+    },
+    {
+      "epoch": 0.3287671232876712,
+      "grad_norm": 1.5802958011627197,
+      "learning_rate": 1e-06,
+      "loss": 2.2363,
+      "step": 6
+    },
+    {
+      "epoch": 0.3835616438356164,
+      "grad_norm": 1.5483659505844116,
+      "learning_rate": 1e-06,
+      "loss": 2.256,
+      "step": 7
+    },
+    {
+      "epoch": 0.4383561643835616,
+      "grad_norm": 1.5273737907409668,
+      "learning_rate": 1e-06,
+      "loss": 2.2733,
+      "step": 8
+    },
+    {
+      "epoch": 0.4931506849315068,
+      "grad_norm": 1.534605860710144,
+      "learning_rate": 1e-06,
+      "loss": 2.223,
+      "step": 9
+    },
+    {
+      "epoch": 0.547945205479452,
+      "grad_norm": 1.5523834228515625,
+      "learning_rate": 1e-06,
+      "loss": 2.2755,
+      "step": 10
+    },
+    {
+      "epoch": 0.6027397260273972,
+      "grad_norm": 1.5962920188903809,
+      "learning_rate": 1e-06,
+      "loss": 2.2875,
+      "step": 11
+    },
+    {
+      "epoch": 0.6575342465753424,
+      "grad_norm": 1.5564601421356201,
+      "learning_rate": 1e-06,
+      "loss": 2.2716,
+      "step": 12
+    },
+    {
+      "epoch": 0.7123287671232876,
+      "grad_norm": 1.5305095911026,
+      "learning_rate": 1e-06,
+      "loss": 2.2485,
+      "step": 13
+    },
+    {
+      "epoch": 0.7671232876712328,
+      "grad_norm": 1.4675662517547607,
+      "learning_rate": 1e-06,
+      "loss": 2.2574,
+      "step": 14
+    },
+    {
+      "epoch": 0.821917808219178,
+      "grad_norm": 1.4668537378311157,
+      "learning_rate": 1e-06,
+      "loss": 2.2226,
+      "step": 15
+    },
+    {
+      "epoch": 0.8767123287671232,
+      "grad_norm": 1.5306854248046875,
+      "learning_rate": 1e-06,
+      "loss": 2.2798,
+      "step": 16
+    },
+    {
+      "epoch": 0.9315068493150684,
+      "grad_norm": 1.5047531127929688,
+      "learning_rate": 1e-06,
+      "loss": 2.2486,
+      "step": 17
+    },
+    {
+      "epoch": 0.9863013698630136,
+      "grad_norm": 1.4622173309326172,
+      "learning_rate": 1e-06,
+      "loss": 2.217,
+      "step": 18
+    },
+    {
+      "epoch": 1.0410958904109588,
+      "grad_norm": 1.5452288389205933,
+      "learning_rate": 1e-06,
+      "loss": 2.2271,
+      "step": 19
+    },
+    {
+      "epoch": 1.095890410958904,
+      "grad_norm": 1.4995627403259277,
+      "learning_rate": 1e-06,
+      "loss": 2.2222,
+      "step": 20
+    },
+    {
+      "epoch": 1.1506849315068493,
+      "grad_norm": 1.4030557870864868,
+      "learning_rate": 1e-06,
+      "loss": 2.2547,
+      "step": 21
+    },
+    {
+      "epoch": 1.2054794520547945,
+      "grad_norm": 1.4066240787506104,
+      "learning_rate": 1e-06,
+      "loss": 2.2279,
+      "step": 22
+    },
+    {
+      "epoch": 1.2602739726027397,
+      "grad_norm": 1.4491875171661377,
+      "learning_rate": 1e-06,
+      "loss": 2.2497,
+      "step": 23
+    },
+    {
+      "epoch": 1.3150684931506849,
+      "grad_norm": 1.3880819082260132,
+      "learning_rate": 1e-06,
+      "loss": 2.2593,
+      "step": 24
+    },
+    {
+      "epoch": 1.36986301369863,
+      "grad_norm": 1.471488118171692,
+      "learning_rate": 1e-06,
+      "loss": 2.2496,
+      "step": 25
+    },
+    {
+      "epoch": 1.4246575342465753,
+      "grad_norm": 1.388680338859558,
+      "learning_rate": 1e-06,
+      "loss": 2.2262,
+      "step": 26
+    },
+    {
+      "epoch": 1.4794520547945205,
+      "grad_norm": 1.4523004293441772,
+      "learning_rate": 1e-06,
+      "loss": 2.2535,
+      "step": 27
+    },
+    {
+      "epoch": 1.5342465753424657,
+      "grad_norm": 1.4338841438293457,
+      "learning_rate": 1e-06,
+      "loss": 2.2315,
+      "step": 28
+    },
+    {
+      "epoch": 1.589041095890411,
+      "grad_norm": 1.3985637426376343,
+      "learning_rate": 1e-06,
+      "loss": 2.262,
+      "step": 29
+    },
+    {
+      "epoch": 1.643835616438356,
+      "grad_norm": 1.3776822090148926,
+      "learning_rate": 1e-06,
+      "loss": 2.224,
+      "step": 30
+    },
+    {
+      "epoch": 1.6986301369863015,
+      "grad_norm": 1.3197417259216309,
+      "learning_rate": 1e-06,
+      "loss": 2.2009,
+      "step": 31
+    },
+    {
+      "epoch": 1.7534246575342465,
+      "grad_norm": 1.4159483909606934,
+      "learning_rate": 1e-06,
+      "loss": 2.2131,
+      "step": 32
+    },
+    {
+      "epoch": 1.808219178082192,
+      "grad_norm": 1.3864014148712158,
+      "learning_rate": 1e-06,
+      "loss": 2.2498,
+      "step": 33
+    },
+    {
+      "epoch": 1.8630136986301369,
+      "grad_norm": 1.3488203287124634,
+      "learning_rate": 1e-06,
+      "loss": 2.2147,
+      "step": 34
+    },
+    {
+      "epoch": 1.9178082191780823,
+      "grad_norm": 1.345689296722412,
+      "learning_rate": 1e-06,
+      "loss": 2.2383,
+      "step": 35
+    },
+    {
+      "epoch": 1.9726027397260273,
+      "grad_norm": 1.344303011894226,
+      "learning_rate": 1e-06,
+      "loss": 2.2159,
+      "step": 36
+    },
+    {
+      "epoch": 2.0273972602739727,
+      "grad_norm": 1.3895442485809326,
+      "learning_rate": 1e-06,
+      "loss": 2.2265,
+      "step": 37
+    },
+    {
+      "epoch": 2.0821917808219177,
+      "grad_norm": 1.3593428134918213,
+      "learning_rate": 1e-06,
+      "loss": 2.2063,
+      "step": 38
+    },
+    {
+      "epoch": 2.136986301369863,
+      "grad_norm": 1.3060978651046753,
+      "learning_rate": 1e-06,
+      "loss": 2.2572,
+      "step": 39
+    },
+    {
+      "epoch": 2.191780821917808,
+      "grad_norm": 1.3199517726898193,
+      "learning_rate": 1e-06,
+      "loss": 2.2099,
+      "step": 40
+    },
+    {
+      "epoch": 2.2465753424657535,
+      "grad_norm": 1.3381460905075073,
+      "learning_rate": 1e-06,
+      "loss": 2.2693,
+      "step": 41
+    },
+    {
+      "epoch": 2.3013698630136985,
+      "grad_norm": 1.334553599357605,
+      "learning_rate": 1e-06,
+      "loss": 2.2206,
+      "step": 42
+    },
+    {
+      "epoch": 2.356164383561644,
+      "grad_norm": 1.3222883939743042,
+      "learning_rate": 1e-06,
+      "loss": 2.1851,
+      "step": 43
+    },
+    {
+      "epoch": 2.410958904109589,
+      "grad_norm": 1.3213746547698975,
+      "learning_rate": 1e-06,
+      "loss": 2.2542,
+      "step": 44
+    },
+    {
+      "epoch": 2.4657534246575343,
+      "grad_norm": 1.3214170932769775,
+      "learning_rate": 1e-06,
+      "loss": 2.2319,
+      "step": 45
+    },
+    {
+      "epoch": 2.5205479452054793,
+      "grad_norm": 1.345453143119812,
+      "learning_rate": 1e-06,
+      "loss": 2.222,
+      "step": 46
+    },
+    {
+      "epoch": 2.5753424657534247,
+      "grad_norm": 1.2182488441467285,
+      "learning_rate": 1e-06,
+      "loss": 2.2069,
+      "step": 47
+    },
+    {
+      "epoch": 2.6301369863013697,
+      "grad_norm": 1.2841640710830688,
+      "learning_rate": 1e-06,
+      "loss": 2.2181,
+      "step": 48
+    },
+    {
+      "epoch": 2.684931506849315,
+      "grad_norm": 1.270230770111084,
+      "learning_rate": 1e-06,
+      "loss": 2.2097,
+      "step": 49
+    },
+    {
+      "epoch": 2.73972602739726,
+      "grad_norm": 1.213972806930542,
+      "learning_rate": 1e-06,
+      "loss": 2.218,
+      "step": 50
+    },
+    {
+      "epoch": 2.7945205479452055,
+      "grad_norm": 1.2877941131591797,
+      "learning_rate": 1e-06,
+      "loss": 2.2055,
+      "step": 51
+    },
+    {
+      "epoch": 2.8493150684931505,
+      "grad_norm": 1.273301601409912,
+      "learning_rate": 1e-06,
+      "loss": 2.1895,
+      "step": 52
+    },
+    {
+      "epoch": 2.904109589041096,
+      "grad_norm": 1.2318782806396484,
+      "learning_rate": 1e-06,
+      "loss": 2.2255,
+      "step": 53
+    },
+    {
+      "epoch": 2.958904109589041,
+      "grad_norm": 1.1937693357467651,
+      "learning_rate": 1e-06,
+      "loss": 2.1865,
+      "step": 54
+    },
+    {
+      "epoch": 3.0136986301369864,
+      "grad_norm": 1.1707606315612793,
+      "learning_rate": 1e-06,
+      "loss": 2.2179,
+      "step": 55
+    },
+    {
+      "epoch": 3.0684931506849313,
+      "grad_norm": 1.2074235677719116,
+      "learning_rate": 1e-06,
+      "loss": 2.155,
+      "step": 56
+    },
+    {
+      "epoch": 3.1232876712328768,
+      "grad_norm": 1.1725316047668457,
+      "learning_rate": 1e-06,
+      "loss": 2.2011,
+      "step": 57
+    },
+    {
+      "epoch": 3.1780821917808217,
+      "grad_norm": 1.1967130899429321,
+      "learning_rate": 1e-06,
+      "loss": 2.2155,
+      "step": 58
+    },
+    {
+      "epoch": 3.232876712328767,
+      "grad_norm": 1.1932190656661987,
+      "learning_rate": 1e-06,
+      "loss": 2.1858,
+      "step": 59
+    },
+    {
+      "epoch": 3.287671232876712,
+      "grad_norm": 1.19328773021698,
+      "learning_rate": 1e-06,
+      "loss": 2.2351,
+      "step": 60
+    },
+    {
+      "epoch": 3.3424657534246576,
+      "grad_norm": 1.1168928146362305,
+      "learning_rate": 1e-06,
+      "loss": 2.2022,
+      "step": 61
+    },
+    {
+      "epoch": 3.3972602739726026,
+      "grad_norm": 1.2043449878692627,
+      "learning_rate": 1e-06,
+      "loss": 2.1964,
+      "step": 62
+    },
+    {
+      "epoch": 3.452054794520548,
+      "grad_norm": 1.2224105596542358,
+      "learning_rate": 1e-06,
+      "loss": 2.1919,
+      "step": 63
+    },
+    {
+      "epoch": 3.506849315068493,
+      "grad_norm": 1.2362271547317505,
+      "learning_rate": 1e-06,
+      "loss": 2.199,
+      "step": 64
+    },
+    {
+      "epoch": 3.5616438356164384,
+      "grad_norm": 1.2123560905456543,
+      "learning_rate": 1e-06,
+      "loss": 2.2357,
+      "step": 65
+    },
+    {
+      "epoch": 3.616438356164384,
+      "grad_norm": 1.1854863166809082,
+      "learning_rate": 1e-06,
+      "loss": 2.1878,
+      "step": 66
+    },
+    {
+      "epoch": 3.671232876712329,
+      "grad_norm": 1.1320362091064453,
+      "learning_rate": 1e-06,
+      "loss": 2.1872,
+      "step": 67
+    },
+    {
+      "epoch": 3.7260273972602738,
+      "grad_norm": 1.1633937358856201,
+      "learning_rate": 1e-06,
+      "loss": 2.205,
+      "step": 68
+    },
+    {
+      "epoch": 3.780821917808219,
+      "grad_norm": 1.1435497999191284,
+      "learning_rate": 1e-06,
+      "loss": 2.1972,
+      "step": 69
+    },
+    {
+      "epoch": 3.8356164383561646,
+      "grad_norm": 1.1820743083953857,
+      "learning_rate": 1e-06,
+      "loss": 2.1961,
+      "step": 70
+    },
+    {
+      "epoch": 3.8904109589041096,
+      "grad_norm": 1.203647255897522,
+      "learning_rate": 1e-06,
+      "loss": 2.2149,
+      "step": 71
+    },
+    {
+      "epoch": 3.9452054794520546,
+      "grad_norm": 1.1167892217636108,
+      "learning_rate": 1e-06,
+      "loss": 2.197,
+      "step": 72
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.0951488018035889,
+      "learning_rate": 1e-06,
+      "loss": 2.1898,
+      "step": 73
+    },
+    {
+      "epoch": 4.054794520547945,
+      "grad_norm": 1.1908702850341797,
+      "learning_rate": 1e-06,
+      "loss": 2.1973,
+      "step": 74
+    },
+    {
+      "epoch": 4.109589041095891,
+      "grad_norm": 1.0710009336471558,
+      "learning_rate": 1e-06,
+      "loss": 2.2014,
+      "step": 75
+    },
+    {
+      "epoch": 4.164383561643835,
+      "grad_norm": 1.1268314123153687,
+      "learning_rate": 1e-06,
+      "loss": 2.2125,
+      "step": 76
+    },
+    {
+      "epoch": 4.219178082191781,
+      "grad_norm": 1.0808967351913452,
+      "learning_rate": 1e-06,
+      "loss": 2.2184,
+      "step": 77
+    },
+    {
+      "epoch": 4.273972602739726,
+      "grad_norm": 1.0744292736053467,
+      "learning_rate": 1e-06,
+      "loss": 2.162,
+      "step": 78
+    },
+    {
+      "epoch": 4.328767123287671,
+      "grad_norm": 1.0902713537216187,
+      "learning_rate": 1e-06,
+      "loss": 2.2045,
+      "step": 79
+    },
+    {
+      "epoch": 4.383561643835616,
+      "grad_norm": 1.1404340267181396,
+      "learning_rate": 1e-06,
+      "loss": 2.1919,
+      "step": 80
+    },
+    {
+      "epoch": 4.438356164383562,
+      "grad_norm": 1.0819721221923828,
+      "learning_rate": 1e-06,
+      "loss": 2.1848,
+      "step": 81
+    },
+    {
+      "epoch": 4.493150684931507,
+      "grad_norm": 1.0939464569091797,
+      "learning_rate": 1e-06,
+      "loss": 2.197,
+      "step": 82
+    },
+    {
+      "epoch": 4.5479452054794525,
+      "grad_norm": 1.1371257305145264,
+      "learning_rate": 1e-06,
+      "loss": 2.1802,
+      "step": 83
+    },
+    {
+      "epoch": 4.602739726027397,
+      "grad_norm": 1.0913671255111694,
+      "learning_rate": 1e-06,
+      "loss": 2.182,
+      "step": 84
+    },
+    {
+      "epoch": 4.657534246575342,
+      "grad_norm": 1.0597493648529053,
+      "learning_rate": 1e-06,
+      "loss": 2.1663,
+      "step": 85
+    },
+    {
+      "epoch": 4.712328767123288,
+      "grad_norm": 1.040493130683899,
+      "learning_rate": 1e-06,
+      "loss": 2.1774,
+      "step": 86
+    },
+    {
+      "epoch": 4.767123287671232,
+      "grad_norm": 1.0556532144546509,
+      "learning_rate": 1e-06,
+      "loss": 2.2029,
+      "step": 87
+    },
+    {
+      "epoch": 4.821917808219178,
+      "grad_norm": 1.0801831483840942,
+      "learning_rate": 1e-06,
+      "loss": 2.1648,
+      "step": 88
+    },
+    {
+      "epoch": 4.876712328767123,
+      "grad_norm": 1.073749303817749,
+      "learning_rate": 1e-06,
+      "loss": 2.174,
+      "step": 89
+    },
+    {
+      "epoch": 4.931506849315069,
+      "grad_norm": 1.0210574865341187,
+      "learning_rate": 1e-06,
+      "loss": 2.1474,
+      "step": 90
+    },
+    {
+      "epoch": 4.986301369863014,
+      "grad_norm": 1.0152342319488525,
+      "learning_rate": 1e-06,
+      "loss": 2.1629,
+      "step": 91
+    },
+    {
+      "epoch": 5.041095890410959,
+      "grad_norm": 1.0388507843017578,
+      "learning_rate": 1e-06,
+      "loss": 2.1931,
+      "step": 92
+    },
+    {
+      "epoch": 5.095890410958904,
+      "grad_norm": 1.011426329612732,
+      "learning_rate": 1e-06,
+      "loss": 2.204,
+      "step": 93
+    },
+    {
+      "epoch": 5.1506849315068495,
+      "grad_norm": 1.0486528873443604,
+      "learning_rate": 1e-06,
+      "loss": 2.1908,
+      "step": 94
+    },
+    {
+      "epoch": 5.205479452054795,
+      "grad_norm": 0.9501799941062927,
+      "learning_rate": 1e-06,
+      "loss": 2.1823,
+      "step": 95
+    },
+    {
+      "epoch": 5.260273972602739,
+      "grad_norm": 1.0336531400680542,
+      "learning_rate": 1e-06,
+      "loss": 2.1965,
+      "step": 96
+    },
+    {
+      "epoch": 5.315068493150685,
+      "grad_norm": 1.0227267742156982,
+      "learning_rate": 1e-06,
+      "loss": 2.1896,
+      "step": 97
+    },
+    {
+      "epoch": 5.36986301369863,
+      "grad_norm": 1.0686023235321045,
+      "learning_rate": 1e-06,
+      "loss": 2.1496,
+      "step": 98
+    },
+    {
+      "epoch": 5.424657534246576,
+      "grad_norm": 0.9931809902191162,
+      "learning_rate": 1e-06,
+      "loss": 2.1474,
+      "step": 99
+    },
+    {
+      "epoch": 5.47945205479452,
+      "grad_norm": 0.9578049778938293,
+      "learning_rate": 1e-06,
+      "loss": 2.1488,
+      "step": 100
+    },
+    {
+      "epoch": 5.534246575342466,
+      "grad_norm": 0.9815987944602966,
+      "learning_rate": 1e-06,
+      "loss": 2.1755,
+      "step": 101
+    },
+    {
+      "epoch": 5.589041095890411,
+      "grad_norm": 0.9837309718132019,
+      "learning_rate": 1e-06,
+      "loss": 2.1559,
+      "step": 102
+    },
+    {
+      "epoch": 5.6438356164383565,
+      "grad_norm": 0.9334861040115356,
+      "learning_rate": 1e-06,
+      "loss": 2.1773,
+      "step": 103
+    },
+    {
+      "epoch": 5.698630136986301,
+      "grad_norm": 1.0627118349075317,
+      "learning_rate": 1e-06,
+      "loss": 2.2116,
+      "step": 104
+    },
+    {
+      "epoch": 5.7534246575342465,
+      "grad_norm": 0.9978325963020325,
+      "learning_rate": 1e-06,
+      "loss": 2.1413,
+      "step": 105
+    },
+    {
+      "epoch": 5.808219178082192,
+      "grad_norm": 0.9550198912620544,
+      "learning_rate": 1e-06,
+      "loss": 2.1535,
+      "step": 106
+    },
+    {
+      "epoch": 5.863013698630137,
+      "grad_norm": 0.9339421987533569,
+      "learning_rate": 1e-06,
+      "loss": 2.1504,
+      "step": 107
+    },
+    {
+      "epoch": 5.917808219178082,
+      "grad_norm": 0.9043423533439636,
+      "learning_rate": 1e-06,
+      "loss": 2.1469,
+      "step": 108
+    },
+    {
+      "epoch": 5.972602739726027,
+      "grad_norm": 0.921292781829834,
+      "learning_rate": 1e-06,
+      "loss": 2.1337,
+      "step": 109
+    },
+    {
+      "epoch": 6.027397260273973,
+      "grad_norm": 0.9245712757110596,
+      "learning_rate": 1e-06,
+      "loss": 2.1762,
+      "step": 110
+    },
+    {
+      "epoch": 6.082191780821918,
+      "grad_norm": 0.9610967636108398,
+      "learning_rate": 1e-06,
+      "loss": 2.1618,
+      "step": 111
+    },
+    {
+      "epoch": 6.136986301369863,
+      "grad_norm": 0.9136860370635986,
+      "learning_rate": 1e-06,
+      "loss": 2.1505,
+      "step": 112
+    },
+    {
+      "epoch": 6.191780821917808,
+      "grad_norm": 0.9340102672576904,
+      "learning_rate": 1e-06,
+      "loss": 2.1692,
+      "step": 113
+    },
+    {
+      "epoch": 6.2465753424657535,
+      "grad_norm": 0.8885300159454346,
+      "learning_rate": 1e-06,
+      "loss": 2.1494,
+      "step": 114
+    },
+    {
+      "epoch": 6.301369863013699,
+      "grad_norm": 0.917847216129303,
+      "learning_rate": 1e-06,
+      "loss": 2.1503,
+      "step": 115
+    },
+    {
+      "epoch": 6.3561643835616435,
+      "grad_norm": 0.9519619345664978,
+      "learning_rate": 1e-06,
+      "loss": 2.1766,
+      "step": 116
+    },
+    {
+      "epoch": 6.410958904109589,
+      "grad_norm": 0.8926482200622559,
+      "learning_rate": 1e-06,
+      "loss": 2.1493,
+      "step": 117
+    },
+    {
+      "epoch": 6.465753424657534,
+      "grad_norm": 0.817862868309021,
+      "learning_rate": 1e-06,
+      "loss": 2.166,
+      "step": 118
+    },
+    {
+      "epoch": 6.52054794520548,
+      "grad_norm": 0.8948012590408325,
+      "learning_rate": 1e-06,
+      "loss": 2.1346,
+      "step": 119
+    },
+    {
+      "epoch": 6.575342465753424,
+      "grad_norm": 0.9632709622383118,
+      "learning_rate": 1e-06,
+      "loss": 2.1427,
+      "step": 120
+    },
+    {
+      "epoch": 6.63013698630137,
+      "grad_norm": 0.9267117381095886,
+      "learning_rate": 1e-06,
+      "loss": 2.1581,
+      "step": 121
+    },
+    {
+      "epoch": 6.684931506849315,
+      "grad_norm": 0.9063679575920105,
+      "learning_rate": 1e-06,
+      "loss": 2.1453,
+      "step": 122
+    },
+    {
+      "epoch": 6.739726027397261,
+      "grad_norm": 0.9395270347595215,
+      "learning_rate": 1e-06,
+      "loss": 2.1515,
+      "step": 123
+    },
+    {
+      "epoch": 6.794520547945205,
+      "grad_norm": 0.9410396218299866,
+      "learning_rate": 1e-06,
+      "loss": 2.1518,
+      "step": 124
+    },
+    {
+      "epoch": 6.8493150684931505,
+      "grad_norm": 0.9229517579078674,
+      "learning_rate": 1e-06,
+      "loss": 2.1703,
+      "step": 125
+    },
+    {
+      "epoch": 6.904109589041096,
+      "grad_norm": 0.8469845652580261,
+      "learning_rate": 1e-06,
+      "loss": 2.1491,
+      "step": 126
+    },
+    {
+      "epoch": 6.958904109589041,
+      "grad_norm": 0.9080257415771484,
+      "learning_rate": 1e-06,
+      "loss": 2.1472,
+      "step": 127
+    },
+    {
+      "epoch": 7.013698630136986,
+      "grad_norm": 0.9071102142333984,
+      "learning_rate": 1e-06,
+      "loss": 2.1685,
+      "step": 128
+    },
+    {
+      "epoch": 7.068493150684931,
+      "grad_norm": 0.8933852910995483,
+      "learning_rate": 1e-06,
+      "loss": 2.1617,
+      "step": 129
+    },
+    {
+      "epoch": 7.123287671232877,
+      "grad_norm": 0.9227753281593323,
+      "learning_rate": 1e-06,
+      "loss": 2.1617,
+      "step": 130
+    },
+    {
+      "epoch": 7.178082191780822,
+      "grad_norm": 0.8686262965202332,
+      "learning_rate": 1e-06,
+      "loss": 2.1546,
+      "step": 131
+    },
+    {
+      "epoch": 7.232876712328767,
+      "grad_norm": 0.8385916948318481,
+      "learning_rate": 1e-06,
+      "loss": 2.1442,
+      "step": 132
+    },
+    {
+      "epoch": 7.287671232876712,
+      "grad_norm": 0.8217021822929382,
+      "learning_rate": 1e-06,
+      "loss": 2.1606,
+      "step": 133
+    },
+    {
+      "epoch": 7.342465753424658,
+      "grad_norm": 0.862777590751648,
+      "learning_rate": 1e-06,
+      "loss": 2.153,
+      "step": 134
+    },
+    {
+      "epoch": 7.397260273972603,
+      "grad_norm": 0.8956757187843323,
+      "learning_rate": 1e-06,
+      "loss": 2.1807,
+      "step": 135
+    },
+    {
+      "epoch": 7.4520547945205475,
+      "grad_norm": 0.781984806060791,
+      "learning_rate": 1e-06,
+      "loss": 2.1469,
+      "step": 136
+    },
+    {
+      "epoch": 7.506849315068493,
+      "grad_norm": 0.8100602030754089,
+      "learning_rate": 1e-06,
+      "loss": 2.107,
+      "step": 137
+    },
+    {
+      "epoch": 7.561643835616438,
+      "grad_norm": 0.8204404711723328,
+      "learning_rate": 1e-06,
+      "loss": 2.1477,
+      "step": 138
+    },
+    {
+      "epoch": 7.616438356164384,
+      "grad_norm": 0.8198928236961365,
+      "learning_rate": 1e-06,
+      "loss": 2.1514,
+      "step": 139
+    },
+    {
+      "epoch": 7.671232876712329,
+      "grad_norm": 0.8388807773590088,
+      "learning_rate": 1e-06,
+      "loss": 2.1265,
+      "step": 140
+    },
+    {
+      "epoch": 7.726027397260274,
+      "grad_norm": 0.8662092089653015,
+      "learning_rate": 1e-06,
+      "loss": 2.1316,
+      "step": 141
+    },
+    {
+      "epoch": 7.780821917808219,
+      "grad_norm": 0.7682031393051147,
+      "learning_rate": 1e-06,
+      "loss": 2.1164,
+      "step": 142
+    },
+    {
+      "epoch": 7.835616438356165,
+      "grad_norm": 0.796292781829834,
+      "learning_rate": 1e-06,
+      "loss": 2.1342,
+      "step": 143
+    },
+    {
+      "epoch": 7.890410958904109,
+      "grad_norm": 0.8075994253158569,
+      "learning_rate": 1e-06,
+      "loss": 2.1221,
+      "step": 144
+    },
+    {
+      "epoch": 7.945205479452055,
+      "grad_norm": 0.8507598638534546,
+      "learning_rate": 1e-06,
+      "loss": 2.1513,
+      "step": 145
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.768495500087738,
+      "learning_rate": 1e-06,
+      "loss": 2.1369,
+      "step": 146
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 540,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 30,
+  "save_steps": 500,
+  "total_flos": 1.0225056854153626e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9878e4831c5c04dc6b7feec2819c9ae2ca129a83ad0ca3012c0b67edeb1c7852
+size 5048