joseagmz committed on
Commit
4a44a9d
1 Parent(s): 4ac8f52

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
4
+ tags:
5
+ - generated_from_trainer
6
+ model-index:
7
+ - name: TinyLlama-Real-PsychiatryCaseNotes-epochs-1-lr-0002
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
15
+ <details><summary>See axolotl config</summary>
16
+
17
+ axolotl version: `0.4.0`
18
+ ```yaml
19
+ adapter: null
20
+ base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
21
+ bf16: auto
22
+ dataset_prepared_path: last_run_prepared
23
+ datasets:
24
+ - path: utrgvseniorproject/PsychiatryCaseNotes
25
+ type: completion
26
+ debug: null
27
+ deepspeed: null
28
+ early_stopping_patience: null
29
+ eval_sample_packing: false
30
+ eval_table_size: null
31
+ evals_per_epoch: 4
32
+ flash_attention: true
33
+ flash_attn_cross_entropy: false
34
+ flash_attn_fuse_mlp: true
35
+ flash_attn_fuse_qkv: false
36
+ flash_attn_rms_norm: true
37
+ fp16: null
38
+ fsdp: null
39
+ fsdp_config: null
40
+ gradient_accumulation_steps: 1
41
+ gradient_checkpointing: true
42
+ group_by_length: false
43
+ learning_rate: 0.0002
44
+ load_in_4bit: false
45
+ load_in_8bit: false
46
+ local_rank: null
47
+ logging_steps: 1
48
+ lora_alpha: null
49
+ lora_dropout: null
50
+ lora_fan_in_fan_out: null
51
+ lora_model_dir: null
52
+ lora_r: null
53
+ lora_target_linear: null
54
+ lr_scheduler: cosine
55
+ micro_batch_size: 1
56
+ model_type: LlamaForCausalLM
57
+ num_epochs: 1
58
+ optimizer: adamw_bnb_8bit
59
+ output_dir: ./TinyLlama-Real-PsychiatryCaseNotes-epochs-1-lr-0002
60
+ pad_to_sequence_len: true
61
+ resume_from_checkpoint: null
62
+ sample_packing: true
63
+ saves_per_epoch: 1
64
+ sequence_len: 2048
65
+ special_tokens: null
66
+ strict: false
67
+ tf32: false
68
+ tokenizer_type: LlamaTokenizer
69
+ train_on_inputs: false
70
+ val_set_size: 0.05
71
+ wandb_entity: utrgvmedai
72
+ wandb_log_model: null
73
+ wandb_name: tinyLama_colab_test_3
74
+ wandb_project: TinyLlama-Real-PsychiatryCaseNotes-epochs-1-lr-0002
75
+ wandb_watch: null
76
+ warmup_steps: 100
77
+ weight_decay: 0.1
78
+ xformers_attention: null
79
+
80
+ ```
81
+
82
+ </details><br>
83
+
84
+ # TinyLlama-Real-PsychiatryCaseNotes-epochs-1-lr-0002
85
+
86
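+ This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on the [utrgvseniorproject/PsychiatryCaseNotes](https://huggingface.co/datasets/utrgvseniorproject/PsychiatryCaseNotes) dataset.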
87
+ It achieves the following results on the evaluation set:
88
+ - Loss: 2.3789
89
+
90
+ ## Model description
91
+
92
+ More information needed
93
+
94
+ ## Intended uses & limitations
95
+
96
+ More information needed
97
+
98
+ ## Training and evaluation data
99
+
100
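+ Per the axolotl config above, the model was trained on the `utrgvseniorproject/PsychiatryCaseNotes` dataset (loaded with `type: completion`), with 5% of the data held out for evaluation (`val_set_size: 0.05`). More information needed.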
101
+
102
+ ## Training procedure
103
+
104
+ ### Training hyperparameters
105
+
106
+ The following hyperparameters were used during training:
107
+ - learning_rate: 0.0002
108
+ - train_batch_size: 1
109
+ - eval_batch_size: 1
110
+ - seed: 42
111
+ - optimizer: AdamW, 8-bit bitsandbytes variant (`adamw_bnb_8bit` in the axolotl config), with betas=(0.9,0.999) and epsilon=1e-08
112
+ - lr_scheduler_type: cosine
113
+ - lr_scheduler_warmup_steps: 100
114
+ - num_epochs: 1
115
+
116
+ ### Training results
117
+
118
+ | Training Loss | Epoch | Step | Validation Loss |
119
+ |:-------------:|:-----:|:----:|:---------------:|
120
+ | 5.2368 | 0.01 | 1 | 4.7826 |
121
+ | 2.2757 | 0.25 | 48 | 2.3982 |
122
+ | 2.5869 | 0.51 | 96 | 2.5119 |
123
+ | 2.664 | 0.76 | 144 | 2.3789 |
124
+
125
+
126
+ ### Framework versions
127
+
128
+ - Transformers 4.38.2
129
+ - Pytorch 2.1.2+cu121
130
+ - Datasets 2.18.0
131
+ - Tokenizers 0.15.0
checkpoint-190/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5632,
14
+ "max_position_embeddings": 2048,
15
+ "model_type": "llama",
16
+ "num_attention_heads": 32,
17
+ "num_hidden_layers": 22,
18
+ "num_key_value_heads": 4,
19
+ "pretraining_tp": 1,
20
+ "rms_norm_eps": 1e-05,
21
+ "rope_scaling": null,
22
+ "rope_theta": 10000.0,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.38.2",
26
+ "use_cache": false,
27
+ "vocab_size": 32000
28
+ }
checkpoint-190/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "do_sample": true,
4
+ "eos_token_id": 2,
5
+ "max_length": 2048,
6
+ "pad_token_id": 0,
7
+ "transformers_version": "4.38.2"
8
+ }
checkpoint-190/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c94fc6ab29fc4560201af9f53e39d552e1323984a0bdf05b6a28a68091b5c0b1
3
+ size 2200117448
checkpoint-190/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3f45e4963b975a971c05b85610744fc1e612dff7b943843710f15a3281cd348
3
+ size 2205132090
checkpoint-190/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b3ee827a7a00012c0a116546df467feee35e70376d81a7a85b1a70eb90414d3
3
+ size 14244
checkpoint-190/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a1f0c95b57cb9fc9522d06668539442e29c9ba28b246eca9df75bdbd91dc225
3
+ size 1064
checkpoint-190/trainer_state.json ADDED
@@ -0,0 +1,1383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 48,
6
+ "global_step": 190,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "grad_norm": 51.5,
14
+ "learning_rate": 2.0000000000000003e-06,
15
+ "loss": 5.2368,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.01,
20
+ "eval_loss": 4.782614707946777,
21
+ "eval_runtime": 50.8138,
22
+ "eval_samples_per_second": 20.742,
23
+ "eval_steps_per_second": 20.742,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.01,
28
+ "grad_norm": 48.25,
29
+ "learning_rate": 4.000000000000001e-06,
30
+ "loss": 4.8965,
31
+ "step": 2
32
+ },
33
+ {
34
+ "epoch": 0.02,
35
+ "grad_norm": 49.25,
36
+ "learning_rate": 6e-06,
37
+ "loss": 5.0969,
38
+ "step": 3
39
+ },
40
+ {
41
+ "epoch": 0.02,
42
+ "grad_norm": 51.5,
43
+ "learning_rate": 8.000000000000001e-06,
44
+ "loss": 5.0122,
45
+ "step": 4
46
+ },
47
+ {
48
+ "epoch": 0.03,
49
+ "grad_norm": 48.75,
50
+ "learning_rate": 1e-05,
51
+ "loss": 5.2505,
52
+ "step": 5
53
+ },
54
+ {
55
+ "epoch": 0.03,
56
+ "grad_norm": 45.25,
57
+ "learning_rate": 1.2e-05,
58
+ "loss": 4.8966,
59
+ "step": 6
60
+ },
61
+ {
62
+ "epoch": 0.04,
63
+ "grad_norm": 37.5,
64
+ "learning_rate": 1.4000000000000001e-05,
65
+ "loss": 4.4979,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 0.04,
70
+ "grad_norm": 30.125,
71
+ "learning_rate": 1.6000000000000003e-05,
72
+ "loss": 4.2038,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 0.05,
77
+ "grad_norm": 29.625,
78
+ "learning_rate": 1.8e-05,
79
+ "loss": 4.3498,
80
+ "step": 9
81
+ },
82
+ {
83
+ "epoch": 0.05,
84
+ "grad_norm": 27.5,
85
+ "learning_rate": 2e-05,
86
+ "loss": 4.0182,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.06,
91
+ "grad_norm": 29.875,
92
+ "learning_rate": 2.2000000000000003e-05,
93
+ "loss": 3.6018,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.06,
98
+ "grad_norm": 27.375,
99
+ "learning_rate": 2.4e-05,
100
+ "loss": 3.4676,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.07,
105
+ "grad_norm": 33.5,
106
+ "learning_rate": 2.6000000000000002e-05,
107
+ "loss": 3.1663,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.07,
112
+ "grad_norm": 20.0,
113
+ "learning_rate": 2.8000000000000003e-05,
114
+ "loss": 3.0102,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.08,
119
+ "grad_norm": 15.5,
120
+ "learning_rate": 3e-05,
121
+ "loss": 2.9281,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.08,
126
+ "grad_norm": 17.75,
127
+ "learning_rate": 3.2000000000000005e-05,
128
+ "loss": 2.7986,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.09,
133
+ "grad_norm": 11.0,
134
+ "learning_rate": 3.4000000000000007e-05,
135
+ "loss": 2.7211,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.09,
140
+ "grad_norm": 15.5,
141
+ "learning_rate": 3.6e-05,
142
+ "loss": 2.5806,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.1,
147
+ "grad_norm": 17.375,
148
+ "learning_rate": 3.8e-05,
149
+ "loss": 2.4546,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.11,
154
+ "grad_norm": 10.0625,
155
+ "learning_rate": 4e-05,
156
+ "loss": 2.6552,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.11,
161
+ "grad_norm": 11.0625,
162
+ "learning_rate": 4.2e-05,
163
+ "loss": 2.4723,
164
+ "step": 21
165
+ },
166
+ {
167
+ "epoch": 0.12,
168
+ "grad_norm": 10.375,
169
+ "learning_rate": 4.4000000000000006e-05,
170
+ "loss": 2.5615,
171
+ "step": 22
172
+ },
173
+ {
174
+ "epoch": 0.12,
175
+ "grad_norm": 12.5,
176
+ "learning_rate": 4.600000000000001e-05,
177
+ "loss": 2.4721,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.13,
182
+ "grad_norm": 8.75,
183
+ "learning_rate": 4.8e-05,
184
+ "loss": 2.3344,
185
+ "step": 24
186
+ },
187
+ {
188
+ "epoch": 0.13,
189
+ "grad_norm": 14.25,
190
+ "learning_rate": 5e-05,
191
+ "loss": 2.4028,
192
+ "step": 25
193
+ },
194
+ {
195
+ "epoch": 0.14,
196
+ "grad_norm": 9.5625,
197
+ "learning_rate": 5.2000000000000004e-05,
198
+ "loss": 2.3864,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 0.14,
203
+ "grad_norm": 11.75,
204
+ "learning_rate": 5.4000000000000005e-05,
205
+ "loss": 2.3027,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 0.15,
210
+ "grad_norm": 11.5625,
211
+ "learning_rate": 5.6000000000000006e-05,
212
+ "loss": 2.4142,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 0.15,
217
+ "grad_norm": 8.5625,
218
+ "learning_rate": 5.8e-05,
219
+ "loss": 2.3577,
220
+ "step": 29
221
+ },
222
+ {
223
+ "epoch": 0.16,
224
+ "grad_norm": 10.8125,
225
+ "learning_rate": 6e-05,
226
+ "loss": 2.2604,
227
+ "step": 30
228
+ },
229
+ {
230
+ "epoch": 0.16,
231
+ "grad_norm": 12.0625,
232
+ "learning_rate": 6.2e-05,
233
+ "loss": 2.1379,
234
+ "step": 31
235
+ },
236
+ {
237
+ "epoch": 0.17,
238
+ "grad_norm": 12.875,
239
+ "learning_rate": 6.400000000000001e-05,
240
+ "loss": 2.2754,
241
+ "step": 32
242
+ },
243
+ {
244
+ "epoch": 0.17,
245
+ "grad_norm": 14.1875,
246
+ "learning_rate": 6.6e-05,
247
+ "loss": 2.1848,
248
+ "step": 33
249
+ },
250
+ {
251
+ "epoch": 0.18,
252
+ "grad_norm": 12.5625,
253
+ "learning_rate": 6.800000000000001e-05,
254
+ "loss": 2.2751,
255
+ "step": 34
256
+ },
257
+ {
258
+ "epoch": 0.18,
259
+ "grad_norm": 10.125,
260
+ "learning_rate": 7e-05,
261
+ "loss": 2.2342,
262
+ "step": 35
263
+ },
264
+ {
265
+ "epoch": 0.19,
266
+ "grad_norm": 10.3125,
267
+ "learning_rate": 7.2e-05,
268
+ "loss": 2.1432,
269
+ "step": 36
270
+ },
271
+ {
272
+ "epoch": 0.19,
273
+ "grad_norm": 15.5,
274
+ "learning_rate": 7.4e-05,
275
+ "loss": 2.4477,
276
+ "step": 37
277
+ },
278
+ {
279
+ "epoch": 0.2,
280
+ "grad_norm": 13.0625,
281
+ "learning_rate": 7.6e-05,
282
+ "loss": 2.4045,
283
+ "step": 38
284
+ },
285
+ {
286
+ "epoch": 0.21,
287
+ "grad_norm": 9.125,
288
+ "learning_rate": 7.800000000000001e-05,
289
+ "loss": 2.4156,
290
+ "step": 39
291
+ },
292
+ {
293
+ "epoch": 0.21,
294
+ "grad_norm": 10.5,
295
+ "learning_rate": 8e-05,
296
+ "loss": 2.2679,
297
+ "step": 40
298
+ },
299
+ {
300
+ "epoch": 0.22,
301
+ "grad_norm": 11.75,
302
+ "learning_rate": 8.2e-05,
303
+ "loss": 2.3045,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 0.22,
308
+ "grad_norm": 13.5,
309
+ "learning_rate": 8.4e-05,
310
+ "loss": 2.0572,
311
+ "step": 42
312
+ },
313
+ {
314
+ "epoch": 0.23,
315
+ "grad_norm": 13.75,
316
+ "learning_rate": 8.6e-05,
317
+ "loss": 2.1523,
318
+ "step": 43
319
+ },
320
+ {
321
+ "epoch": 0.23,
322
+ "grad_norm": 11.875,
323
+ "learning_rate": 8.800000000000001e-05,
324
+ "loss": 2.1531,
325
+ "step": 44
326
+ },
327
+ {
328
+ "epoch": 0.24,
329
+ "grad_norm": 11.6875,
330
+ "learning_rate": 9e-05,
331
+ "loss": 1.9768,
332
+ "step": 45
333
+ },
334
+ {
335
+ "epoch": 0.24,
336
+ "grad_norm": 8.125,
337
+ "learning_rate": 9.200000000000001e-05,
338
+ "loss": 2.4555,
339
+ "step": 46
340
+ },
341
+ {
342
+ "epoch": 0.25,
343
+ "grad_norm": 6.625,
344
+ "learning_rate": 9.4e-05,
345
+ "loss": 2.3451,
346
+ "step": 47
347
+ },
348
+ {
349
+ "epoch": 0.25,
350
+ "grad_norm": 16.5,
351
+ "learning_rate": 9.6e-05,
352
+ "loss": 2.2757,
353
+ "step": 48
354
+ },
355
+ {
356
+ "epoch": 0.25,
357
+ "eval_loss": 2.3981776237487793,
358
+ "eval_runtime": 50.6747,
359
+ "eval_samples_per_second": 20.799,
360
+ "eval_steps_per_second": 20.799,
361
+ "step": 48
362
+ },
363
+ {
364
+ "epoch": 0.26,
365
+ "grad_norm": 16.25,
366
+ "learning_rate": 9.8e-05,
367
+ "loss": 2.4249,
368
+ "step": 49
369
+ },
370
+ {
371
+ "epoch": 0.26,
372
+ "grad_norm": 5.84375,
373
+ "learning_rate": 0.0001,
374
+ "loss": 2.306,
375
+ "step": 50
376
+ },
377
+ {
378
+ "epoch": 0.27,
379
+ "grad_norm": 7.75,
380
+ "learning_rate": 0.00010200000000000001,
381
+ "loss": 2.4711,
382
+ "step": 51
383
+ },
384
+ {
385
+ "epoch": 0.27,
386
+ "grad_norm": 9.1875,
387
+ "learning_rate": 0.00010400000000000001,
388
+ "loss": 2.3926,
389
+ "step": 52
390
+ },
391
+ {
392
+ "epoch": 0.28,
393
+ "grad_norm": 13.125,
394
+ "learning_rate": 0.00010600000000000002,
395
+ "loss": 2.1229,
396
+ "step": 53
397
+ },
398
+ {
399
+ "epoch": 0.28,
400
+ "grad_norm": 9.8125,
401
+ "learning_rate": 0.00010800000000000001,
402
+ "loss": 2.0945,
403
+ "step": 54
404
+ },
405
+ {
406
+ "epoch": 0.29,
407
+ "grad_norm": 12.375,
408
+ "learning_rate": 0.00011000000000000002,
409
+ "loss": 2.248,
410
+ "step": 55
411
+ },
412
+ {
413
+ "epoch": 0.29,
414
+ "grad_norm": 8.125,
415
+ "learning_rate": 0.00011200000000000001,
416
+ "loss": 2.349,
417
+ "step": 56
418
+ },
419
+ {
420
+ "epoch": 0.3,
421
+ "grad_norm": 15.5625,
422
+ "learning_rate": 0.00011399999999999999,
423
+ "loss": 2.2546,
424
+ "step": 57
425
+ },
426
+ {
427
+ "epoch": 0.31,
428
+ "grad_norm": 13.5,
429
+ "learning_rate": 0.000116,
430
+ "loss": 2.4298,
431
+ "step": 58
432
+ },
433
+ {
434
+ "epoch": 0.31,
435
+ "grad_norm": 8.3125,
436
+ "learning_rate": 0.000118,
437
+ "loss": 2.1131,
438
+ "step": 59
439
+ },
440
+ {
441
+ "epoch": 0.32,
442
+ "grad_norm": 8.625,
443
+ "learning_rate": 0.00012,
444
+ "loss": 2.2696,
445
+ "step": 60
446
+ },
447
+ {
448
+ "epoch": 0.32,
449
+ "grad_norm": 6.21875,
450
+ "learning_rate": 0.000122,
451
+ "loss": 2.244,
452
+ "step": 61
453
+ },
454
+ {
455
+ "epoch": 0.33,
456
+ "grad_norm": 5.875,
457
+ "learning_rate": 0.000124,
458
+ "loss": 2.1473,
459
+ "step": 62
460
+ },
461
+ {
462
+ "epoch": 0.33,
463
+ "grad_norm": 7.28125,
464
+ "learning_rate": 0.000126,
465
+ "loss": 2.2231,
466
+ "step": 63
467
+ },
468
+ {
469
+ "epoch": 0.34,
470
+ "grad_norm": 5.1875,
471
+ "learning_rate": 0.00012800000000000002,
472
+ "loss": 2.0796,
473
+ "step": 64
474
+ },
475
+ {
476
+ "epoch": 0.34,
477
+ "grad_norm": 7.59375,
478
+ "learning_rate": 0.00013000000000000002,
479
+ "loss": 2.3191,
480
+ "step": 65
481
+ },
482
+ {
483
+ "epoch": 0.35,
484
+ "grad_norm": 6.53125,
485
+ "learning_rate": 0.000132,
486
+ "loss": 2.214,
487
+ "step": 66
488
+ },
489
+ {
490
+ "epoch": 0.35,
491
+ "grad_norm": 8.375,
492
+ "learning_rate": 0.000134,
493
+ "loss": 2.3527,
494
+ "step": 67
495
+ },
496
+ {
497
+ "epoch": 0.36,
498
+ "grad_norm": 7.09375,
499
+ "learning_rate": 0.00013600000000000003,
500
+ "loss": 2.3337,
501
+ "step": 68
502
+ },
503
+ {
504
+ "epoch": 0.36,
505
+ "grad_norm": 7.28125,
506
+ "learning_rate": 0.000138,
507
+ "loss": 2.2933,
508
+ "step": 69
509
+ },
510
+ {
511
+ "epoch": 0.37,
512
+ "grad_norm": 19.75,
513
+ "learning_rate": 0.00014,
514
+ "loss": 2.3885,
515
+ "step": 70
516
+ },
517
+ {
518
+ "epoch": 0.37,
519
+ "grad_norm": 16.625,
520
+ "learning_rate": 0.000142,
521
+ "loss": 2.348,
522
+ "step": 71
523
+ },
524
+ {
525
+ "epoch": 0.38,
526
+ "grad_norm": 6.0,
527
+ "learning_rate": 0.000144,
528
+ "loss": 2.2594,
529
+ "step": 72
530
+ },
531
+ {
532
+ "epoch": 0.38,
533
+ "grad_norm": 11.875,
534
+ "learning_rate": 0.000146,
535
+ "loss": 2.5611,
536
+ "step": 73
537
+ },
538
+ {
539
+ "epoch": 0.39,
540
+ "grad_norm": 5.875,
541
+ "learning_rate": 0.000148,
542
+ "loss": 2.0386,
543
+ "step": 74
544
+ },
545
+ {
546
+ "epoch": 0.39,
547
+ "grad_norm": 16.0,
548
+ "learning_rate": 0.00015000000000000001,
549
+ "loss": 2.656,
550
+ "step": 75
551
+ },
552
+ {
553
+ "epoch": 0.4,
554
+ "grad_norm": 8.1875,
555
+ "learning_rate": 0.000152,
556
+ "loss": 2.3885,
557
+ "step": 76
558
+ },
559
+ {
560
+ "epoch": 0.41,
561
+ "grad_norm": 11.375,
562
+ "learning_rate": 0.000154,
563
+ "loss": 2.701,
564
+ "step": 77
565
+ },
566
+ {
567
+ "epoch": 0.41,
568
+ "grad_norm": 8.75,
569
+ "learning_rate": 0.00015600000000000002,
570
+ "loss": 2.5388,
571
+ "step": 78
572
+ },
573
+ {
574
+ "epoch": 0.42,
575
+ "grad_norm": 7.84375,
576
+ "learning_rate": 0.00015800000000000002,
577
+ "loss": 2.5263,
578
+ "step": 79
579
+ },
580
+ {
581
+ "epoch": 0.42,
582
+ "grad_norm": 6.3125,
583
+ "learning_rate": 0.00016,
584
+ "loss": 2.7169,
585
+ "step": 80
586
+ },
587
+ {
588
+ "epoch": 0.43,
589
+ "grad_norm": 19.125,
590
+ "learning_rate": 0.000162,
591
+ "loss": 2.504,
592
+ "step": 81
593
+ },
594
+ {
595
+ "epoch": 0.43,
596
+ "grad_norm": 8.25,
597
+ "learning_rate": 0.000164,
598
+ "loss": 2.4614,
599
+ "step": 82
600
+ },
601
+ {
602
+ "epoch": 0.44,
603
+ "grad_norm": 5.5625,
604
+ "learning_rate": 0.000166,
605
+ "loss": 2.7586,
606
+ "step": 83
607
+ },
608
+ {
609
+ "epoch": 0.44,
610
+ "grad_norm": 8.3125,
611
+ "learning_rate": 0.000168,
612
+ "loss": 2.6373,
613
+ "step": 84
614
+ },
615
+ {
616
+ "epoch": 0.45,
617
+ "grad_norm": 8.0625,
618
+ "learning_rate": 0.00017,
619
+ "loss": 2.3237,
620
+ "step": 85
621
+ },
622
+ {
623
+ "epoch": 0.45,
624
+ "grad_norm": 8.4375,
625
+ "learning_rate": 0.000172,
626
+ "loss": 2.1896,
627
+ "step": 86
628
+ },
629
+ {
630
+ "epoch": 0.46,
631
+ "grad_norm": 11.5625,
632
+ "learning_rate": 0.000174,
633
+ "loss": 2.5089,
634
+ "step": 87
635
+ },
636
+ {
637
+ "epoch": 0.46,
638
+ "grad_norm": 7.15625,
639
+ "learning_rate": 0.00017600000000000002,
640
+ "loss": 2.5806,
641
+ "step": 88
642
+ },
643
+ {
644
+ "epoch": 0.47,
645
+ "grad_norm": 8.875,
646
+ "learning_rate": 0.00017800000000000002,
647
+ "loss": 2.5497,
648
+ "step": 89
649
+ },
650
+ {
651
+ "epoch": 0.47,
652
+ "grad_norm": 8.6875,
653
+ "learning_rate": 0.00018,
654
+ "loss": 2.3526,
655
+ "step": 90
656
+ },
657
+ {
658
+ "epoch": 0.48,
659
+ "grad_norm": 5.96875,
660
+ "learning_rate": 0.000182,
661
+ "loss": 2.283,
662
+ "step": 91
663
+ },
664
+ {
665
+ "epoch": 0.48,
666
+ "grad_norm": 6.375,
667
+ "learning_rate": 0.00018400000000000003,
668
+ "loss": 2.5388,
669
+ "step": 92
670
+ },
671
+ {
672
+ "epoch": 0.49,
673
+ "grad_norm": 5.5625,
674
+ "learning_rate": 0.00018600000000000002,
675
+ "loss": 2.4216,
676
+ "step": 93
677
+ },
678
+ {
679
+ "epoch": 0.49,
680
+ "grad_norm": 5.5625,
681
+ "learning_rate": 0.000188,
682
+ "loss": 2.4199,
683
+ "step": 94
684
+ },
685
+ {
686
+ "epoch": 0.5,
687
+ "grad_norm": 6.5,
688
+ "learning_rate": 0.00019,
689
+ "loss": 2.6459,
690
+ "step": 95
691
+ },
692
+ {
693
+ "epoch": 0.51,
694
+ "grad_norm": 5.46875,
695
+ "learning_rate": 0.000192,
696
+ "loss": 2.5869,
697
+ "step": 96
698
+ },
699
+ {
700
+ "epoch": 0.51,
701
+ "eval_loss": 2.5119166374206543,
702
+ "eval_runtime": 50.7277,
703
+ "eval_samples_per_second": 20.778,
704
+ "eval_steps_per_second": 20.778,
705
+ "step": 96
706
+ },
707
+ {
708
+ "epoch": 0.51,
709
+ "grad_norm": 5.3125,
710
+ "learning_rate": 0.000194,
711
+ "loss": 2.6162,
712
+ "step": 97
713
+ },
714
+ {
715
+ "epoch": 0.52,
716
+ "grad_norm": 6.5625,
717
+ "learning_rate": 0.000196,
718
+ "loss": 2.605,
719
+ "step": 98
720
+ },
721
+ {
722
+ "epoch": 0.52,
723
+ "grad_norm": 4.9375,
724
+ "learning_rate": 0.00019800000000000002,
725
+ "loss": 2.4971,
726
+ "step": 99
727
+ },
728
+ {
729
+ "epoch": 0.53,
730
+ "grad_norm": 8.8125,
731
+ "learning_rate": 0.0002,
732
+ "loss": 2.4572,
733
+ "step": 100
734
+ },
735
+ {
736
+ "epoch": 0.53,
737
+ "grad_norm": 5.96875,
738
+ "learning_rate": 0.0001999390827019096,
739
+ "loss": 2.4781,
740
+ "step": 101
741
+ },
742
+ {
743
+ "epoch": 0.54,
744
+ "grad_norm": 9.3125,
745
+ "learning_rate": 0.00019975640502598244,
746
+ "loss": 2.5578,
747
+ "step": 102
748
+ },
749
+ {
750
+ "epoch": 0.54,
751
+ "grad_norm": 5.625,
752
+ "learning_rate": 0.00019945218953682734,
753
+ "loss": 2.4608,
754
+ "step": 103
755
+ },
756
+ {
757
+ "epoch": 0.55,
758
+ "grad_norm": 6.65625,
759
+ "learning_rate": 0.00019902680687415705,
760
+ "loss": 2.874,
761
+ "step": 104
762
+ },
763
+ {
764
+ "epoch": 0.55,
765
+ "grad_norm": 10.875,
766
+ "learning_rate": 0.00019848077530122083,
767
+ "loss": 2.8733,
768
+ "step": 105
769
+ },
770
+ {
771
+ "epoch": 0.56,
772
+ "grad_norm": 1568.0,
773
+ "learning_rate": 0.00019781476007338058,
774
+ "loss": 6.3646,
775
+ "step": 106
776
+ },
777
+ {
778
+ "epoch": 0.56,
779
+ "grad_norm": 648.0,
780
+ "learning_rate": 0.00019702957262759965,
781
+ "loss": 3.75,
782
+ "step": 107
783
+ },
784
+ {
785
+ "epoch": 0.57,
786
+ "grad_norm": 11.0625,
787
+ "learning_rate": 0.0001961261695938319,
788
+ "loss": 2.8153,
789
+ "step": 108
790
+ },
791
+ {
792
+ "epoch": 0.57,
793
+ "grad_norm": 6.375,
794
+ "learning_rate": 0.00019510565162951537,
795
+ "loss": 2.7939,
796
+ "step": 109
797
+ },
798
+ {
799
+ "epoch": 0.58,
800
+ "grad_norm": 8.25,
801
+ "learning_rate": 0.00019396926207859084,
802
+ "loss": 2.8736,
803
+ "step": 110
804
+ },
805
+ {
806
+ "epoch": 0.58,
807
+ "grad_norm": 7.21875,
808
+ "learning_rate": 0.00019271838545667876,
809
+ "loss": 2.4797,
810
+ "step": 111
811
+ },
812
+ {
813
+ "epoch": 0.59,
814
+ "grad_norm": 5.6875,
815
+ "learning_rate": 0.0001913545457642601,
816
+ "loss": 2.7665,
817
+ "step": 112
818
+ },
819
+ {
820
+ "epoch": 0.59,
821
+ "grad_norm": 6.625,
822
+ "learning_rate": 0.0001898794046299167,
823
+ "loss": 2.6241,
824
+ "step": 113
825
+ },
826
+ {
827
+ "epoch": 0.6,
828
+ "grad_norm": 4.65625,
829
+ "learning_rate": 0.00018829475928589271,
830
+ "loss": 2.5569,
831
+ "step": 114
832
+ },
833
+ {
834
+ "epoch": 0.61,
835
+ "grad_norm": 6.6875,
836
+ "learning_rate": 0.00018660254037844388,
837
+ "loss": 2.6452,
838
+ "step": 115
839
+ },
840
+ {
841
+ "epoch": 0.61,
842
+ "grad_norm": 4.6875,
843
+ "learning_rate": 0.0001848048096156426,
844
+ "loss": 2.6205,
845
+ "step": 116
846
+ },
847
+ {
848
+ "epoch": 0.62,
849
+ "grad_norm": 6.53125,
850
+ "learning_rate": 0.00018290375725550417,
851
+ "loss": 2.5831,
852
+ "step": 117
853
+ },
854
+ {
855
+ "epoch": 0.62,
856
+ "grad_norm": 5.90625,
857
+ "learning_rate": 0.00018090169943749476,
858
+ "loss": 2.6601,
859
+ "step": 118
860
+ },
861
+ {
862
+ "epoch": 0.63,
863
+ "grad_norm": 6.6875,
864
+ "learning_rate": 0.00017880107536067218,
865
+ "loss": 2.5187,
866
+ "step": 119
867
+ },
868
+ {
869
+ "epoch": 0.63,
870
+ "grad_norm": 5.53125,
871
+ "learning_rate": 0.0001766044443118978,
872
+ "loss": 2.5023,
873
+ "step": 120
874
+ },
875
+ {
876
+ "epoch": 0.64,
877
+ "grad_norm": 7.84375,
878
+ "learning_rate": 0.00017431448254773944,
879
+ "loss": 2.6064,
880
+ "step": 121
881
+ },
882
+ {
883
+ "epoch": 0.64,
884
+ "grad_norm": 7.65625,
885
+ "learning_rate": 0.0001719339800338651,
886
+ "loss": 2.5065,
887
+ "step": 122
888
+ },
889
+ {
890
+ "epoch": 0.65,
891
+ "grad_norm": 6.15625,
892
+ "learning_rate": 0.00016946583704589973,
893
+ "loss": 2.6022,
894
+ "step": 123
895
+ },
896
+ {
897
+ "epoch": 0.65,
898
+ "grad_norm": 5.15625,
899
+ "learning_rate": 0.00016691306063588583,
900
+ "loss": 2.5981,
901
+ "step": 124
902
+ },
903
+ {
904
+ "epoch": 0.66,
905
+ "grad_norm": 5.28125,
906
+ "learning_rate": 0.00016427876096865394,
907
+ "loss": 2.6804,
908
+ "step": 125
909
+ },
910
+ {
911
+ "epoch": 0.66,
912
+ "grad_norm": 5.71875,
913
+ "learning_rate": 0.0001615661475325658,
914
+ "loss": 2.7233,
915
+ "step": 126
916
+ },
917
+ {
918
+ "epoch": 0.67,
919
+ "grad_norm": 3.84375,
920
+ "learning_rate": 0.00015877852522924732,
921
+ "loss": 2.4712,
922
+ "step": 127
923
+ },
924
+ {
925
+ "epoch": 0.67,
926
+ "grad_norm": 5.9375,
927
+ "learning_rate": 0.0001559192903470747,
928
+ "loss": 2.4518,
929
+ "step": 128
930
+ },
931
+ {
932
+ "epoch": 0.68,
933
+ "grad_norm": 4.53125,
934
+ "learning_rate": 0.0001529919264233205,
935
+ "loss": 2.5456,
936
+ "step": 129
937
+ },
938
+ {
939
+ "epoch": 0.68,
940
+ "grad_norm": 5.4375,
941
+ "learning_rate": 0.00015000000000000001,
942
+ "loss": 2.4622,
943
+ "step": 130
944
+ },
945
+ {
946
+ "epoch": 0.69,
947
+ "grad_norm": 4.84375,
948
+ "learning_rate": 0.00014694715627858908,
949
+ "loss": 2.6133,
950
+ "step": 131
951
+ },
952
+ {
953
+ "epoch": 0.69,
954
+ "grad_norm": 5.65625,
955
+ "learning_rate": 0.00014383711467890774,
956
+ "loss": 2.4844,
957
+ "step": 132
958
+ },
959
+ {
960
+ "epoch": 0.7,
961
+ "grad_norm": 5.09375,
962
+ "learning_rate": 0.00014067366430758004,
963
+ "loss": 2.676,
964
+ "step": 133
965
+ },
966
+ {
967
+ "epoch": 0.71,
968
+ "grad_norm": 6.65625,
969
+ "learning_rate": 0.00013746065934159123,
970
+ "loss": 2.6896,
971
+ "step": 134
972
+ },
973
+ {
974
+ "epoch": 0.71,
975
+ "grad_norm": 5.5625,
976
+ "learning_rate": 0.00013420201433256689,
977
+ "loss": 2.5363,
978
+ "step": 135
979
+ },
980
+ {
981
+ "epoch": 0.72,
982
+ "grad_norm": 4.34375,
983
+ "learning_rate": 0.00013090169943749476,
984
+ "loss": 2.2558,
985
+ "step": 136
986
+ },
987
+ {
988
+ "epoch": 0.72,
989
+ "grad_norm": 6.53125,
990
+ "learning_rate": 0.0001275637355816999,
991
+ "loss": 2.5952,
992
+ "step": 137
993
+ },
994
+ {
995
+ "epoch": 0.73,
996
+ "grad_norm": 4.28125,
997
+ "learning_rate": 0.00012419218955996676,
998
+ "loss": 2.3442,
999
+ "step": 138
1000
+ },
1001
+ {
1002
+ "epoch": 0.73,
1003
+ "grad_norm": 4.28125,
1004
+ "learning_rate": 0.00012079116908177593,
1005
+ "loss": 2.4271,
1006
+ "step": 139
1007
+ },
1008
+ {
1009
+ "epoch": 0.74,
1010
+ "grad_norm": 4.28125,
1011
+ "learning_rate": 0.00011736481776669306,
1012
+ "loss": 2.4587,
1013
+ "step": 140
1014
+ },
1015
+ {
1016
+ "epoch": 0.74,
1017
+ "grad_norm": 3.671875,
1018
+ "learning_rate": 0.00011391731009600654,
1019
+ "loss": 2.5501,
1020
+ "step": 141
1021
+ },
1022
+ {
1023
+ "epoch": 0.75,
1024
+ "grad_norm": 5.375,
1025
+ "learning_rate": 0.00011045284632676536,
1026
+ "loss": 2.4936,
1027
+ "step": 142
1028
+ },
1029
+ {
1030
+ "epoch": 0.75,
1031
+ "grad_norm": 5.5625,
1032
+ "learning_rate": 0.00010697564737441252,
1033
+ "loss": 2.1856,
1034
+ "step": 143
1035
+ },
1036
+ {
1037
+ "epoch": 0.76,
1038
+ "grad_norm": 4.03125,
1039
+ "learning_rate": 0.00010348994967025012,
1040
+ "loss": 2.664,
1041
+ "step": 144
1042
+ },
1043
+ {
1044
+ "epoch": 0.76,
1045
+ "eval_loss": 2.378908395767212,
1046
+ "eval_runtime": 50.7719,
1047
+ "eval_samples_per_second": 20.76,
1048
+ "eval_steps_per_second": 20.76,
1049
+ "step": 144
1050
+ },
1051
+ {
1052
+ "epoch": 0.76,
1053
+ "grad_norm": 5.15625,
1054
+ "learning_rate": 0.0001,
1055
+ "loss": 2.4778,
1056
+ "step": 145
1057
+ },
1058
+ {
1059
+ "epoch": 0.77,
1060
+ "grad_norm": 4.625,
1061
+ "learning_rate": 9.651005032974994e-05,
1062
+ "loss": 2.388,
1063
+ "step": 146
1064
+ },
1065
+ {
1066
+ "epoch": 0.77,
1067
+ "grad_norm": 4.1875,
1068
+ "learning_rate": 9.302435262558747e-05,
1069
+ "loss": 2.4377,
1070
+ "step": 147
1071
+ },
1072
+ {
1073
+ "epoch": 0.78,
1074
+ "grad_norm": 6.40625,
1075
+ "learning_rate": 8.954715367323468e-05,
1076
+ "loss": 2.5183,
1077
+ "step": 148
1078
+ },
1079
+ {
1080
+ "epoch": 0.78,
1081
+ "grad_norm": 5.09375,
1082
+ "learning_rate": 8.608268990399349e-05,
1083
+ "loss": 2.5759,
1084
+ "step": 149
1085
+ },
1086
+ {
1087
+ "epoch": 0.79,
1088
+ "grad_norm": 4.4375,
1089
+ "learning_rate": 8.263518223330697e-05,
1090
+ "loss": 2.6291,
1091
+ "step": 150
1092
+ },
1093
+ {
1094
+ "epoch": 0.79,
1095
+ "grad_norm": 5.0625,
1096
+ "learning_rate": 7.920883091822408e-05,
1097
+ "loss": 2.3462,
1098
+ "step": 151
1099
+ },
1100
+ {
1101
+ "epoch": 0.8,
1102
+ "grad_norm": 3.875,
1103
+ "learning_rate": 7.580781044003324e-05,
1104
+ "loss": 2.1391,
1105
+ "step": 152
1106
+ },
1107
+ {
1108
+ "epoch": 0.81,
1109
+ "grad_norm": 3.625,
1110
+ "learning_rate": 7.243626441830009e-05,
1111
+ "loss": 2.3882,
1112
+ "step": 153
1113
+ },
1114
+ {
1115
+ "epoch": 0.81,
1116
+ "grad_norm": 4.21875,
1117
+ "learning_rate": 6.909830056250527e-05,
1118
+ "loss": 2.3264,
1119
+ "step": 154
1120
+ },
1121
+ {
1122
+ "epoch": 0.82,
1123
+ "grad_norm": 3.90625,
1124
+ "learning_rate": 6.579798566743314e-05,
1125
+ "loss": 2.2026,
1126
+ "step": 155
1127
+ },
1128
+ {
1129
+ "epoch": 0.82,
1130
+ "grad_norm": 5.75,
1131
+ "learning_rate": 6.25393406584088e-05,
1132
+ "loss": 2.5855,
1133
+ "step": 156
1134
+ },
1135
+ {
1136
+ "epoch": 0.83,
1137
+ "grad_norm": 4.6875,
1138
+ "learning_rate": 5.9326335692419995e-05,
1139
+ "loss": 2.2483,
1140
+ "step": 157
1141
+ },
1142
+ {
1143
+ "epoch": 0.83,
1144
+ "grad_norm": 3.65625,
1145
+ "learning_rate": 5.616288532109225e-05,
1146
+ "loss": 2.1449,
1147
+ "step": 158
1148
+ },
1149
+ {
1150
+ "epoch": 0.84,
1151
+ "grad_norm": 3.578125,
1152
+ "learning_rate": 5.305284372141095e-05,
1153
+ "loss": 2.2383,
1154
+ "step": 159
1155
+ },
1156
+ {
1157
+ "epoch": 0.84,
1158
+ "grad_norm": 3.65625,
1159
+ "learning_rate": 5.000000000000002e-05,
1160
+ "loss": 2.2851,
1161
+ "step": 160
1162
+ },
1163
+ {
1164
+ "epoch": 0.85,
1165
+ "grad_norm": 4.125,
1166
+ "learning_rate": 4.700807357667952e-05,
1167
+ "loss": 2.4617,
1168
+ "step": 161
1169
+ },
1170
+ {
1171
+ "epoch": 0.85,
1172
+ "grad_norm": 4.4375,
1173
+ "learning_rate": 4.4080709652925336e-05,
1174
+ "loss": 2.1831,
1175
+ "step": 162
1176
+ },
1177
+ {
1178
+ "epoch": 0.86,
1179
+ "grad_norm": 4.3125,
1180
+ "learning_rate": 4.12214747707527e-05,
1181
+ "loss": 2.2079,
1182
+ "step": 163
1183
+ },
1184
+ {
1185
+ "epoch": 0.86,
1186
+ "grad_norm": 3.28125,
1187
+ "learning_rate": 3.843385246743417e-05,
1188
+ "loss": 2.309,
1189
+ "step": 164
1190
+ },
1191
+ {
1192
+ "epoch": 0.87,
1193
+ "grad_norm": 5.15625,
1194
+ "learning_rate": 3.5721239031346066e-05,
1195
+ "loss": 2.4679,
1196
+ "step": 165
1197
+ },
1198
+ {
1199
+ "epoch": 0.87,
1200
+ "grad_norm": 5.09375,
1201
+ "learning_rate": 3.308693936411421e-05,
1202
+ "loss": 2.3249,
1203
+ "step": 166
1204
+ },
1205
+ {
1206
+ "epoch": 0.88,
1207
+ "grad_norm": 4.625,
1208
+ "learning_rate": 3.053416295410026e-05,
1209
+ "loss": 2.2908,
1210
+ "step": 167
1211
+ },
1212
+ {
1213
+ "epoch": 0.88,
1214
+ "grad_norm": 3.90625,
1215
+ "learning_rate": 2.8066019966134904e-05,
1216
+ "loss": 2.3121,
1217
+ "step": 168
1218
+ },
1219
+ {
1220
+ "epoch": 0.89,
1221
+ "grad_norm": 3.8125,
1222
+ "learning_rate": 2.5685517452260567e-05,
1223
+ "loss": 2.2577,
1224
+ "step": 169
1225
+ },
1226
+ {
1227
+ "epoch": 0.89,
1228
+ "grad_norm": 3.453125,
1229
+ "learning_rate": 2.339555568810221e-05,
1230
+ "loss": 2.2002,
1231
+ "step": 170
1232
+ },
1233
+ {
1234
+ "epoch": 0.9,
1235
+ "grad_norm": 3.84375,
1236
+ "learning_rate": 2.119892463932781e-05,
1237
+ "loss": 2.4345,
1238
+ "step": 171
1239
+ },
1240
+ {
1241
+ "epoch": 0.91,
1242
+ "grad_norm": 4.21875,
1243
+ "learning_rate": 1.9098300562505266e-05,
1244
+ "loss": 2.2662,
1245
+ "step": 172
1246
+ },
1247
+ {
1248
+ "epoch": 0.91,
1249
+ "grad_norm": 3.546875,
1250
+ "learning_rate": 1.7096242744495837e-05,
1251
+ "loss": 2.4411,
1252
+ "step": 173
1253
+ },
1254
+ {
1255
+ "epoch": 0.92,
1256
+ "grad_norm": 3.640625,
1257
+ "learning_rate": 1.5195190384357404e-05,
1258
+ "loss": 2.4609,
1259
+ "step": 174
1260
+ },
1261
+ {
1262
+ "epoch": 0.92,
1263
+ "grad_norm": 4.375,
1264
+ "learning_rate": 1.339745962155613e-05,
1265
+ "loss": 2.3615,
1266
+ "step": 175
1267
+ },
1268
+ {
1269
+ "epoch": 0.93,
1270
+ "grad_norm": 3.4375,
1271
+ "learning_rate": 1.1705240714107302e-05,
1272
+ "loss": 2.2071,
1273
+ "step": 176
1274
+ },
1275
+ {
1276
+ "epoch": 0.93,
1277
+ "grad_norm": 3.578125,
1278
+ "learning_rate": 1.0120595370083318e-05,
1279
+ "loss": 2.3504,
1280
+ "step": 177
1281
+ },
1282
+ {
1283
+ "epoch": 0.94,
1284
+ "grad_norm": 3.578125,
1285
+ "learning_rate": 8.645454235739903e-06,
1286
+ "loss": 2.2771,
1287
+ "step": 178
1288
+ },
1289
+ {
1290
+ "epoch": 0.94,
1291
+ "grad_norm": 3.5625,
1292
+ "learning_rate": 7.281614543321269e-06,
1293
+ "loss": 2.3663,
1294
+ "step": 179
1295
+ },
1296
+ {
1297
+ "epoch": 0.95,
1298
+ "grad_norm": 3.234375,
1299
+ "learning_rate": 6.030737921409169e-06,
1300
+ "loss": 2.4842,
1301
+ "step": 180
1302
+ },
1303
+ {
1304
+ "epoch": 0.95,
1305
+ "grad_norm": 3.5625,
1306
+ "learning_rate": 4.8943483704846475e-06,
1307
+ "loss": 2.3921,
1308
+ "step": 181
1309
+ },
1310
+ {
1311
+ "epoch": 0.96,
1312
+ "grad_norm": 3.484375,
1313
+ "learning_rate": 3.873830406168111e-06,
1314
+ "loss": 2.3467,
1315
+ "step": 182
1316
+ },
1317
+ {
1318
+ "epoch": 0.96,
1319
+ "grad_norm": 3.34375,
1320
+ "learning_rate": 2.970427372400353e-06,
1321
+ "loss": 2.3193,
1322
+ "step": 183
1323
+ },
1324
+ {
1325
+ "epoch": 0.97,
1326
+ "grad_norm": 3.71875,
1327
+ "learning_rate": 2.1852399266194314e-06,
1328
+ "loss": 2.3215,
1329
+ "step": 184
1330
+ },
1331
+ {
1332
+ "epoch": 0.97,
1333
+ "grad_norm": 3.5625,
1334
+ "learning_rate": 1.5192246987791981e-06,
1335
+ "loss": 2.1687,
1336
+ "step": 185
1337
+ },
1338
+ {
1339
+ "epoch": 0.98,
1340
+ "grad_norm": 3.359375,
1341
+ "learning_rate": 9.731931258429638e-07,
1342
+ "loss": 2.2471,
1343
+ "step": 186
1344
+ },
1345
+ {
1346
+ "epoch": 0.98,
1347
+ "grad_norm": 3.203125,
1348
+ "learning_rate": 5.478104631726711e-07,
1349
+ "loss": 2.356,
1350
+ "step": 187
1351
+ },
1352
+ {
1353
+ "epoch": 0.99,
1354
+ "grad_norm": 3.25,
1355
+ "learning_rate": 2.4359497401758024e-07,
1356
+ "loss": 2.2989,
1357
+ "step": 188
1358
+ },
1359
+ {
1360
+ "epoch": 0.99,
1361
+ "grad_norm": 3.15625,
1362
+ "learning_rate": 6.09172980904238e-08,
1363
+ "loss": 2.0376,
1364
+ "step": 189
1365
+ },
1366
+ {
1367
+ "epoch": 1.0,
1368
+ "grad_norm": 3.625,
1369
+ "learning_rate": 0.0,
1370
+ "loss": 2.5224,
1371
+ "step": 190
1372
+ }
1373
+ ],
1374
+ "logging_steps": 1,
1375
+ "max_steps": 190,
1376
+ "num_input_tokens_seen": 0,
1377
+ "num_train_epochs": 1,
1378
+ "save_steps": 500,
1379
+ "total_flos": 2415296753172480.0,
1380
+ "train_batch_size": 1,
1381
+ "trial_name": null,
1382
+ "trial_params": null
1383
+ }
checkpoint-190/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98e61ddf868ddad87b95a390b6b16ac7373daf85677008accc0f8d9873e6155e
3
+ size 5688
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5632,
14
+ "max_position_embeddings": 2048,
15
+ "model_type": "llama",
16
+ "num_attention_heads": 32,
17
+ "num_hidden_layers": 22,
18
+ "num_key_value_heads": 4,
19
+ "pretraining_tp": 1,
20
+ "rms_norm_eps": 1e-05,
21
+ "rope_scaling": null,
22
+ "rope_theta": 10000.0,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.38.2",
26
+ "use_cache": false,
27
+ "vocab_size": 32000
28
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "do_sample": true,
4
+ "eos_token_id": 2,
5
+ "max_length": 2048,
6
+ "pad_token_id": 0,
7
+ "transformers_version": "4.38.2"
8
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f70c70bc9980c4fd3708e30d151889811203cc9ed15f550cf30bfb165e5d21f8
3
+ size 2200160278
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "legacy": false,
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": "</s>",
37
+ "padding_side": "right",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false,
43
+ "use_fast": true
44
+ }