nrshoudi committed on
Commit
3dd7bb7
1 Parent(s): 0fb1079

End of training

Browse files
README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: peft
4
+ tags:
5
+ - generated_from_trainer
6
+ base_model: openai/whisper-small
7
+ model-index:
8
+ - name: Whisper-small-Ar-MDD
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # Whisper-small-Ar-MDD
16
+
17
+ This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on an unknown dataset.
18
+ It achieves the following results on the evaluation set:
19
+ - Loss: 0.2137
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.001
39
+ - train_batch_size: 6
40
+ - eval_batch_size: 6
41
+ - seed: 42
42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
43
+ - lr_scheduler_type: linear
44
+ - lr_scheduler_warmup_steps: 50
45
+ - num_epochs: 10
46
+ - mixed_precision_training: Native AMP
47
+
48
+ ### Training results
49
+
50
+ | Training Loss | Epoch | Step | Validation Loss |
51
+ |:-------------:|:-----:|:----:|:---------------:|
52
+ | 0.0681 | 1.0 | 546 | 0.1955 |
53
+ | 0.0367 | 2.0 | 1092 | 0.1992 |
54
+ | 0.0382 | 3.0 | 1638 | 0.1857 |
55
+ | 0.0189 | 4.0 | 2184 | 0.1970 |
56
+ | 0.0274 | 5.0 | 2730 | 0.1894 |
57
+ | 0.02 | 6.0 | 3276 | 0.1877 |
58
+ | 0.0087 | 7.0 | 3822 | 0.1908 |
59
+ | 0.0066 | 8.0 | 4368 | 0.2085 |
60
+ | 0.0055 | 9.0 | 4914 | 0.2100 |
61
+ | 0.0013 | 10.0 | 5460 | 0.2137 |
62
+
63
+
64
+ ### Framework versions
65
+
66
+ - PEFT 0.10.0
67
+ - Transformers 4.38.2
68
+ - PyTorch 2.2.1+cu121
69
+ - Datasets 2.18.0
70
+ - Tokenizers 0.15.2
adapter_1/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "WhisperForConditionalGeneration",
5
+ "parent_library": "transformers.models.whisper.modeling_whisper"
6
+ },
7
+ "base_model_name_or_path": "openai/whisper-small",
8
+ "bias": "none",
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_dropout": 0.05,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 32,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "q_proj",
27
+ "v_proj"
28
+ ],
29
+ "task_type": null,
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
adapter_1/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abb0ebdc47891ae516deaeab040b653abd88b2dceb9990155159ce05013d93b9
3
+ size 14176064
adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "WhisperForConditionalGeneration",
5
+ "parent_library": "transformers.models.whisper.modeling_whisper"
6
+ },
7
+ "base_model_name_or_path": "openai/whisper-small",
8
+ "bias": "none",
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_dropout": 0.05,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 32,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "q_proj",
27
+ "v_proj"
28
+ ],
29
+ "task_type": null,
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de7195721032862164c68a850217a7bbc5a0df6dd26266eb5ee8c195bfb57721
3
+ size 14176064
preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "WhisperFeatureExtractor",
4
+ "feature_size": 80,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "processor_class": "WhisperProcessor",
12
+ "return_attention_mask": false,
13
+ "sampling_rate": 16000
14
+ }
trainer_state.json ADDED
@@ -0,0 +1,1636 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5460,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05,
13
+ "grad_norm": Infinity,
14
+ "learning_rate": 0.00044,
15
+ "loss": 4.1397,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.09,
20
+ "grad_norm": 0.8320172429084778,
21
+ "learning_rate": 0.00094,
22
+ "loss": 1.0739,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.14,
27
+ "grad_norm": 1.5708420276641846,
28
+ "learning_rate": 0.0009959334565619224,
29
+ "loss": 0.4997,
30
+ "step": 75
31
+ },
32
+ {
33
+ "epoch": 0.18,
34
+ "grad_norm": 0.5893439650535583,
35
+ "learning_rate": 0.000991312384473198,
36
+ "loss": 0.1246,
37
+ "step": 100
38
+ },
39
+ {
40
+ "epoch": 0.23,
41
+ "grad_norm": 2.0901906490325928,
42
+ "learning_rate": 0.0009866913123844732,
43
+ "loss": 0.1273,
44
+ "step": 125
45
+ },
46
+ {
47
+ "epoch": 0.27,
48
+ "grad_norm": 0.9590554237365723,
49
+ "learning_rate": 0.0009820702402957486,
50
+ "loss": 0.1257,
51
+ "step": 150
52
+ },
53
+ {
54
+ "epoch": 0.32,
55
+ "grad_norm": 1.531774878501892,
56
+ "learning_rate": 0.000977449168207024,
57
+ "loss": 0.1024,
58
+ "step": 175
59
+ },
60
+ {
61
+ "epoch": 0.37,
62
+ "grad_norm": 1.2284561395645142,
63
+ "learning_rate": 0.0009728280961182994,
64
+ "loss": 0.1041,
65
+ "step": 200
66
+ },
67
+ {
68
+ "epoch": 0.41,
69
+ "grad_norm": 1.0752886533737183,
70
+ "learning_rate": 0.0009682070240295749,
71
+ "loss": 0.1225,
72
+ "step": 225
73
+ },
74
+ {
75
+ "epoch": 0.46,
76
+ "grad_norm": 0.7119214534759521,
77
+ "learning_rate": 0.0009635859519408503,
78
+ "loss": 0.0904,
79
+ "step": 250
80
+ },
81
+ {
82
+ "epoch": 0.5,
83
+ "grad_norm": 1.0164552927017212,
84
+ "learning_rate": 0.0009589648798521257,
85
+ "loss": 0.0773,
86
+ "step": 275
87
+ },
88
+ {
89
+ "epoch": 0.55,
90
+ "grad_norm": 0.4222384989261627,
91
+ "learning_rate": 0.0009543438077634012,
92
+ "loss": 0.2081,
93
+ "step": 300
94
+ },
95
+ {
96
+ "epoch": 0.6,
97
+ "grad_norm": 0.17551083862781525,
98
+ "learning_rate": 0.0009497227356746766,
99
+ "loss": 0.0863,
100
+ "step": 325
101
+ },
102
+ {
103
+ "epoch": 0.64,
104
+ "grad_norm": 0.41848981380462646,
105
+ "learning_rate": 0.000945101663585952,
106
+ "loss": 0.0632,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 0.69,
111
+ "grad_norm": 0.7539293766021729,
112
+ "learning_rate": 0.0009404805914972274,
113
+ "loss": 0.077,
114
+ "step": 375
115
+ },
116
+ {
117
+ "epoch": 0.73,
118
+ "grad_norm": 0.3750676214694977,
119
+ "learning_rate": 0.0009358595194085028,
120
+ "loss": 0.0948,
121
+ "step": 400
122
+ },
123
+ {
124
+ "epoch": 0.78,
125
+ "grad_norm": 0.33498436212539673,
126
+ "learning_rate": 0.0009312384473197783,
127
+ "loss": 0.077,
128
+ "step": 425
129
+ },
130
+ {
131
+ "epoch": 0.82,
132
+ "grad_norm": 0.43420735001564026,
133
+ "learning_rate": 0.0009266173752310536,
134
+ "loss": 0.0729,
135
+ "step": 450
136
+ },
137
+ {
138
+ "epoch": 0.87,
139
+ "grad_norm": 1.0590511560440063,
140
+ "learning_rate": 0.0009219963031423291,
141
+ "loss": 0.0816,
142
+ "step": 475
143
+ },
144
+ {
145
+ "epoch": 0.92,
146
+ "grad_norm": 0.34223881363868713,
147
+ "learning_rate": 0.0009173752310536044,
148
+ "loss": 0.0567,
149
+ "step": 500
150
+ },
151
+ {
152
+ "epoch": 0.96,
153
+ "grad_norm": 0.26913997530937195,
154
+ "learning_rate": 0.0009127541589648799,
155
+ "loss": 0.0681,
156
+ "step": 525
157
+ },
158
+ {
159
+ "epoch": 1.0,
160
+ "eval_loss": 0.19546650350093842,
161
+ "eval_runtime": 175.4508,
162
+ "eval_samples_per_second": 4.634,
163
+ "eval_steps_per_second": 0.775,
164
+ "step": 546
165
+ },
166
+ {
167
+ "epoch": 1.01,
168
+ "grad_norm": 0.41807565093040466,
169
+ "learning_rate": 0.0009081330868761552,
170
+ "loss": 0.0365,
171
+ "step": 550
172
+ },
173
+ {
174
+ "epoch": 1.05,
175
+ "grad_norm": 0.8459017872810364,
176
+ "learning_rate": 0.0009035120147874307,
177
+ "loss": 0.0481,
178
+ "step": 575
179
+ },
180
+ {
181
+ "epoch": 1.1,
182
+ "grad_norm": 0.21556143462657928,
183
+ "learning_rate": 0.000898890942698706,
184
+ "loss": 0.0426,
185
+ "step": 600
186
+ },
187
+ {
188
+ "epoch": 1.14,
189
+ "grad_norm": 0.06540340185165405,
190
+ "learning_rate": 0.0008942698706099815,
191
+ "loss": 0.0435,
192
+ "step": 625
193
+ },
194
+ {
195
+ "epoch": 1.19,
196
+ "grad_norm": 0.8853626251220703,
197
+ "learning_rate": 0.0008896487985212569,
198
+ "loss": 0.0668,
199
+ "step": 650
200
+ },
201
+ {
202
+ "epoch": 1.24,
203
+ "grad_norm": 0.2227557897567749,
204
+ "learning_rate": 0.0008850277264325323,
205
+ "loss": 0.0373,
206
+ "step": 675
207
+ },
208
+ {
209
+ "epoch": 1.28,
210
+ "grad_norm": 0.5395527482032776,
211
+ "learning_rate": 0.0008804066543438077,
212
+ "loss": 0.0505,
213
+ "step": 700
214
+ },
215
+ {
216
+ "epoch": 1.33,
217
+ "grad_norm": 0.47810012102127075,
218
+ "learning_rate": 0.0008757855822550833,
219
+ "loss": 0.0619,
220
+ "step": 725
221
+ },
222
+ {
223
+ "epoch": 1.37,
224
+ "grad_norm": 0.41824525594711304,
225
+ "learning_rate": 0.0008711645101663586,
226
+ "loss": 0.0505,
227
+ "step": 750
228
+ },
229
+ {
230
+ "epoch": 1.42,
231
+ "grad_norm": 0.35845717787742615,
232
+ "learning_rate": 0.0008665434380776341,
233
+ "loss": 0.0556,
234
+ "step": 775
235
+ },
236
+ {
237
+ "epoch": 1.47,
238
+ "grad_norm": 0.591626763343811,
239
+ "learning_rate": 0.0008619223659889095,
240
+ "loss": 0.0452,
241
+ "step": 800
242
+ },
243
+ {
244
+ "epoch": 1.51,
245
+ "grad_norm": 0.52753746509552,
246
+ "learning_rate": 0.0008573012939001849,
247
+ "loss": 0.0413,
248
+ "step": 825
249
+ },
250
+ {
251
+ "epoch": 1.56,
252
+ "grad_norm": 0.17933356761932373,
253
+ "learning_rate": 0.0008526802218114603,
254
+ "loss": 0.0264,
255
+ "step": 850
256
+ },
257
+ {
258
+ "epoch": 1.6,
259
+ "grad_norm": 0.4725402891635895,
260
+ "learning_rate": 0.0008480591497227357,
261
+ "loss": 0.0503,
262
+ "step": 875
263
+ },
264
+ {
265
+ "epoch": 1.65,
266
+ "grad_norm": 0.43168240785598755,
267
+ "learning_rate": 0.0008434380776340112,
268
+ "loss": 0.0543,
269
+ "step": 900
270
+ },
271
+ {
272
+ "epoch": 1.69,
273
+ "grad_norm": 0.15935927629470825,
274
+ "learning_rate": 0.0008388170055452865,
275
+ "loss": 0.0625,
276
+ "step": 925
277
+ },
278
+ {
279
+ "epoch": 1.74,
280
+ "grad_norm": 0.1527830809354782,
281
+ "learning_rate": 0.000834195933456562,
282
+ "loss": 0.0338,
283
+ "step": 950
284
+ },
285
+ {
286
+ "epoch": 1.79,
287
+ "grad_norm": 0.6140448451042175,
288
+ "learning_rate": 0.0008295748613678373,
289
+ "loss": 0.0391,
290
+ "step": 975
291
+ },
292
+ {
293
+ "epoch": 1.83,
294
+ "grad_norm": 0.34989482164382935,
295
+ "learning_rate": 0.0008249537892791128,
296
+ "loss": 0.0428,
297
+ "step": 1000
298
+ },
299
+ {
300
+ "epoch": 1.88,
301
+ "grad_norm": 1.364334225654602,
302
+ "learning_rate": 0.0008203327171903881,
303
+ "loss": 0.0452,
304
+ "step": 1025
305
+ },
306
+ {
307
+ "epoch": 1.92,
308
+ "grad_norm": 0.4410119354724884,
309
+ "learning_rate": 0.0008157116451016636,
310
+ "loss": 0.0519,
311
+ "step": 1050
312
+ },
313
+ {
314
+ "epoch": 1.97,
315
+ "grad_norm": 0.018156496807932854,
316
+ "learning_rate": 0.000811090573012939,
317
+ "loss": 0.0367,
318
+ "step": 1075
319
+ },
320
+ {
321
+ "epoch": 2.0,
322
+ "eval_loss": 0.19916145503520966,
323
+ "eval_runtime": 174.1191,
324
+ "eval_samples_per_second": 4.669,
325
+ "eval_steps_per_second": 0.781,
326
+ "step": 1092
327
+ },
328
+ {
329
+ "epoch": 2.01,
330
+ "grad_norm": 0.2884193956851959,
331
+ "learning_rate": 0.0008064695009242144,
332
+ "loss": 0.0437,
333
+ "step": 1100
334
+ },
335
+ {
336
+ "epoch": 2.06,
337
+ "grad_norm": 0.14955410361289978,
338
+ "learning_rate": 0.0008018484288354898,
339
+ "loss": 0.0332,
340
+ "step": 1125
341
+ },
342
+ {
343
+ "epoch": 2.11,
344
+ "grad_norm": 0.016795417293906212,
345
+ "learning_rate": 0.0007972273567467652,
346
+ "loss": 0.0614,
347
+ "step": 1150
348
+ },
349
+ {
350
+ "epoch": 2.15,
351
+ "grad_norm": 0.272935688495636,
352
+ "learning_rate": 0.0007926062846580406,
353
+ "loss": 0.0354,
354
+ "step": 1175
355
+ },
356
+ {
357
+ "epoch": 2.2,
358
+ "grad_norm": 0.181545689702034,
359
+ "learning_rate": 0.0007879852125693162,
360
+ "loss": 0.0477,
361
+ "step": 1200
362
+ },
363
+ {
364
+ "epoch": 2.24,
365
+ "grad_norm": 0.14347945153713226,
366
+ "learning_rate": 0.0007833641404805915,
367
+ "loss": 0.0285,
368
+ "step": 1225
369
+ },
370
+ {
371
+ "epoch": 2.29,
372
+ "grad_norm": 0.1175965741276741,
373
+ "learning_rate": 0.000778743068391867,
374
+ "loss": 0.0398,
375
+ "step": 1250
376
+ },
377
+ {
378
+ "epoch": 2.34,
379
+ "grad_norm": 0.089854396879673,
380
+ "learning_rate": 0.0007741219963031424,
381
+ "loss": 0.0473,
382
+ "step": 1275
383
+ },
384
+ {
385
+ "epoch": 2.38,
386
+ "grad_norm": 0.3998129665851593,
387
+ "learning_rate": 0.0007695009242144178,
388
+ "loss": 0.0425,
389
+ "step": 1300
390
+ },
391
+ {
392
+ "epoch": 2.43,
393
+ "grad_norm": 0.42039960622787476,
394
+ "learning_rate": 0.0007648798521256932,
395
+ "loss": 0.0404,
396
+ "step": 1325
397
+ },
398
+ {
399
+ "epoch": 2.47,
400
+ "grad_norm": 0.3940460979938507,
401
+ "learning_rate": 0.0007602587800369686,
402
+ "loss": 0.0381,
403
+ "step": 1350
404
+ },
405
+ {
406
+ "epoch": 2.52,
407
+ "grad_norm": 0.37924668192863464,
408
+ "learning_rate": 0.0007556377079482441,
409
+ "loss": 0.0397,
410
+ "step": 1375
411
+ },
412
+ {
413
+ "epoch": 2.56,
414
+ "grad_norm": 0.46505168080329895,
415
+ "learning_rate": 0.0007510166358595194,
416
+ "loss": 0.0348,
417
+ "step": 1400
418
+ },
419
+ {
420
+ "epoch": 2.61,
421
+ "grad_norm": 0.2604403495788574,
422
+ "learning_rate": 0.0007463955637707949,
423
+ "loss": 0.036,
424
+ "step": 1425
425
+ },
426
+ {
427
+ "epoch": 2.66,
428
+ "grad_norm": 0.4742681384086609,
429
+ "learning_rate": 0.0007417744916820702,
430
+ "loss": 0.0216,
431
+ "step": 1450
432
+ },
433
+ {
434
+ "epoch": 2.7,
435
+ "grad_norm": 0.5116605162620544,
436
+ "learning_rate": 0.0007371534195933457,
437
+ "loss": 0.0281,
438
+ "step": 1475
439
+ },
440
+ {
441
+ "epoch": 2.75,
442
+ "grad_norm": 0.8539583683013916,
443
+ "learning_rate": 0.000732532347504621,
444
+ "loss": 0.0347,
445
+ "step": 1500
446
+ },
447
+ {
448
+ "epoch": 2.79,
449
+ "grad_norm": 1.8664207458496094,
450
+ "learning_rate": 0.0007279112754158965,
451
+ "loss": 0.045,
452
+ "step": 1525
453
+ },
454
+ {
455
+ "epoch": 2.84,
456
+ "grad_norm": 0.18790672719478607,
457
+ "learning_rate": 0.0007232902033271719,
458
+ "loss": 0.0233,
459
+ "step": 1550
460
+ },
461
+ {
462
+ "epoch": 2.88,
463
+ "grad_norm": 0.31298670172691345,
464
+ "learning_rate": 0.0007186691312384473,
465
+ "loss": 0.0337,
466
+ "step": 1575
467
+ },
468
+ {
469
+ "epoch": 2.93,
470
+ "grad_norm": 0.8353794813156128,
471
+ "learning_rate": 0.0007140480591497227,
472
+ "loss": 0.0387,
473
+ "step": 1600
474
+ },
475
+ {
476
+ "epoch": 2.98,
477
+ "grad_norm": 0.08966954797506332,
478
+ "learning_rate": 0.0007094269870609981,
479
+ "loss": 0.0382,
480
+ "step": 1625
481
+ },
482
+ {
483
+ "epoch": 3.0,
484
+ "eval_loss": 0.18573200702667236,
485
+ "eval_runtime": 175.912,
486
+ "eval_samples_per_second": 4.622,
487
+ "eval_steps_per_second": 0.773,
488
+ "step": 1638
489
+ },
490
+ {
491
+ "epoch": 3.02,
492
+ "grad_norm": 0.039618588984012604,
493
+ "learning_rate": 0.0007048059149722735,
494
+ "loss": 0.0353,
495
+ "step": 1650
496
+ },
497
+ {
498
+ "epoch": 3.07,
499
+ "grad_norm": 0.3156013488769531,
500
+ "learning_rate": 0.000700184842883549,
501
+ "loss": 0.0318,
502
+ "step": 1675
503
+ },
504
+ {
505
+ "epoch": 3.11,
506
+ "grad_norm": 1.9033042192459106,
507
+ "learning_rate": 0.0006955637707948245,
508
+ "loss": 0.0273,
509
+ "step": 1700
510
+ },
511
+ {
512
+ "epoch": 3.16,
513
+ "grad_norm": 0.32316115498542786,
514
+ "learning_rate": 0.0006909426987060999,
515
+ "loss": 0.035,
516
+ "step": 1725
517
+ },
518
+ {
519
+ "epoch": 3.21,
520
+ "grad_norm": 0.5656726956367493,
521
+ "learning_rate": 0.0006863216266173753,
522
+ "loss": 0.0311,
523
+ "step": 1750
524
+ },
525
+ {
526
+ "epoch": 3.25,
527
+ "grad_norm": 0.032537270337343216,
528
+ "learning_rate": 0.0006817005545286507,
529
+ "loss": 0.0274,
530
+ "step": 1775
531
+ },
532
+ {
533
+ "epoch": 3.3,
534
+ "grad_norm": 0.30572062730789185,
535
+ "learning_rate": 0.0006770794824399261,
536
+ "loss": 0.0362,
537
+ "step": 1800
538
+ },
539
+ {
540
+ "epoch": 3.34,
541
+ "grad_norm": 0.3374157249927521,
542
+ "learning_rate": 0.0006724584103512015,
543
+ "loss": 0.0305,
544
+ "step": 1825
545
+ },
546
+ {
547
+ "epoch": 3.39,
548
+ "grad_norm": 0.1089138388633728,
549
+ "learning_rate": 0.000667837338262477,
550
+ "loss": 0.0301,
551
+ "step": 1850
552
+ },
553
+ {
554
+ "epoch": 3.43,
555
+ "grad_norm": 0.10849720984697342,
556
+ "learning_rate": 0.0006632162661737523,
557
+ "loss": 0.0241,
558
+ "step": 1875
559
+ },
560
+ {
561
+ "epoch": 3.48,
562
+ "grad_norm": 0.11349553614854813,
563
+ "learning_rate": 0.0006585951940850278,
564
+ "loss": 0.0218,
565
+ "step": 1900
566
+ },
567
+ {
568
+ "epoch": 3.53,
569
+ "grad_norm": 0.3237963616847992,
570
+ "learning_rate": 0.0006539741219963031,
571
+ "loss": 0.019,
572
+ "step": 1925
573
+ },
574
+ {
575
+ "epoch": 3.57,
576
+ "grad_norm": 0.5101845860481262,
577
+ "learning_rate": 0.0006493530499075786,
578
+ "loss": 0.0244,
579
+ "step": 1950
580
+ },
581
+ {
582
+ "epoch": 3.62,
583
+ "grad_norm": 0.010137775912880898,
584
+ "learning_rate": 0.0006447319778188539,
585
+ "loss": 0.0304,
586
+ "step": 1975
587
+ },
588
+ {
589
+ "epoch": 3.66,
590
+ "grad_norm": 0.8954480886459351,
591
+ "learning_rate": 0.0006401109057301294,
592
+ "loss": 0.0306,
593
+ "step": 2000
594
+ },
595
+ {
596
+ "epoch": 3.71,
597
+ "grad_norm": 0.014889650978147984,
598
+ "learning_rate": 0.0006354898336414048,
599
+ "loss": 0.0251,
600
+ "step": 2025
601
+ },
602
+ {
603
+ "epoch": 3.75,
604
+ "grad_norm": 0.0878029614686966,
605
+ "learning_rate": 0.0006308687615526802,
606
+ "loss": 0.0341,
607
+ "step": 2050
608
+ },
609
+ {
610
+ "epoch": 3.8,
611
+ "grad_norm": 0.13351218402385712,
612
+ "learning_rate": 0.0006262476894639556,
613
+ "loss": 0.0246,
614
+ "step": 2075
615
+ },
616
+ {
617
+ "epoch": 3.85,
618
+ "grad_norm": 0.3208947479724884,
619
+ "learning_rate": 0.000621626617375231,
620
+ "loss": 0.0248,
621
+ "step": 2100
622
+ },
623
+ {
624
+ "epoch": 3.89,
625
+ "grad_norm": 0.42570358514785767,
626
+ "learning_rate": 0.0006170055452865064,
627
+ "loss": 0.0358,
628
+ "step": 2125
629
+ },
630
+ {
631
+ "epoch": 3.94,
632
+ "grad_norm": 0.131515234708786,
633
+ "learning_rate": 0.000612384473197782,
634
+ "loss": 0.0355,
635
+ "step": 2150
636
+ },
637
+ {
638
+ "epoch": 3.98,
639
+ "grad_norm": 0.12344180792570114,
640
+ "learning_rate": 0.0006077634011090574,
641
+ "loss": 0.0189,
642
+ "step": 2175
643
+ },
644
+ {
645
+ "epoch": 4.0,
646
+ "eval_loss": 0.19701677560806274,
647
+ "eval_runtime": 176.7126,
648
+ "eval_samples_per_second": 4.601,
649
+ "eval_steps_per_second": 0.77,
650
+ "step": 2184
651
+ },
652
+ {
653
+ "epoch": 4.03,
654
+ "grad_norm": 0.10986749082803726,
655
+ "learning_rate": 0.0006031423290203328,
656
+ "loss": 0.0259,
657
+ "step": 2200
658
+ },
659
+ {
660
+ "epoch": 4.08,
661
+ "grad_norm": 0.46528518199920654,
662
+ "learning_rate": 0.0005985212569316082,
663
+ "loss": 0.022,
664
+ "step": 2225
665
+ },
666
+ {
667
+ "epoch": 4.12,
668
+ "grad_norm": 0.2069913148880005,
669
+ "learning_rate": 0.0005939001848428836,
670
+ "loss": 0.015,
671
+ "step": 2250
672
+ },
673
+ {
674
+ "epoch": 4.17,
675
+ "grad_norm": 0.34658578038215637,
676
+ "learning_rate": 0.000589279112754159,
677
+ "loss": 0.0299,
678
+ "step": 2275
679
+ },
680
+ {
681
+ "epoch": 4.21,
682
+ "grad_norm": 0.18868118524551392,
683
+ "learning_rate": 0.0005846580406654344,
684
+ "loss": 0.0174,
685
+ "step": 2300
686
+ },
687
+ {
688
+ "epoch": 4.26,
689
+ "grad_norm": 0.33069688081741333,
690
+ "learning_rate": 0.0005800369685767099,
691
+ "loss": 0.0216,
692
+ "step": 2325
693
+ },
694
+ {
695
+ "epoch": 4.3,
696
+ "grad_norm": 0.7511343955993652,
697
+ "learning_rate": 0.0005754158964879852,
698
+ "loss": 0.0157,
699
+ "step": 2350
700
+ },
701
+ {
702
+ "epoch": 4.35,
703
+ "grad_norm": 0.27277225255966187,
704
+ "learning_rate": 0.0005707948243992607,
705
+ "loss": 0.0198,
706
+ "step": 2375
707
+ },
708
+ {
709
+ "epoch": 4.4,
710
+ "grad_norm": 2.3098878860473633,
711
+ "learning_rate": 0.000566173752310536,
712
+ "loss": 0.026,
713
+ "step": 2400
714
+ },
715
+ {
716
+ "epoch": 4.44,
717
+ "grad_norm": 0.39823707938194275,
718
+ "learning_rate": 0.0005615526802218115,
719
+ "loss": 0.0118,
720
+ "step": 2425
721
+ },
722
+ {
723
+ "epoch": 4.49,
724
+ "grad_norm": 0.2773701250553131,
725
+ "learning_rate": 0.0005569316081330868,
726
+ "loss": 0.0319,
727
+ "step": 2450
728
+ },
729
+ {
730
+ "epoch": 4.53,
731
+ "grad_norm": 0.2549929916858673,
732
+ "learning_rate": 0.0005523105360443623,
733
+ "loss": 0.0164,
734
+ "step": 2475
735
+ },
736
+ {
737
+ "epoch": 4.58,
738
+ "grad_norm": 3.1059272289276123,
739
+ "learning_rate": 0.0005476894639556377,
740
+ "loss": 0.0231,
741
+ "step": 2500
742
+ },
743
+ {
744
+ "epoch": 4.62,
745
+ "grad_norm": 0.10516056418418884,
746
+ "learning_rate": 0.0005430683918669131,
747
+ "loss": 0.0262,
748
+ "step": 2525
749
+ },
750
+ {
751
+ "epoch": 4.67,
752
+ "grad_norm": 0.046087902039289474,
753
+ "learning_rate": 0.0005384473197781885,
754
+ "loss": 0.0212,
755
+ "step": 2550
756
+ },
757
+ {
758
+ "epoch": 4.72,
759
+ "grad_norm": 0.9207663536071777,
760
+ "learning_rate": 0.0005338262476894639,
761
+ "loss": 0.018,
762
+ "step": 2575
763
+ },
764
+ {
765
+ "epoch": 4.76,
766
+ "grad_norm": 0.5687919855117798,
767
+ "learning_rate": 0.0005292051756007393,
768
+ "loss": 0.0255,
769
+ "step": 2600
770
+ },
771
+ {
772
+ "epoch": 4.81,
773
+ "grad_norm": 0.006184098310768604,
774
+ "learning_rate": 0.0005245841035120147,
775
+ "loss": 0.0207,
776
+ "step": 2625
777
+ },
778
+ {
779
+ "epoch": 4.85,
780
+ "grad_norm": 0.5442487597465515,
781
+ "learning_rate": 0.0005199630314232903,
782
+ "loss": 0.0192,
783
+ "step": 2650
784
+ },
785
+ {
786
+ "epoch": 4.9,
787
+ "grad_norm": 0.031753990799188614,
788
+ "learning_rate": 0.0005153419593345657,
789
+ "loss": 0.015,
790
+ "step": 2675
791
+ },
792
+ {
793
+ "epoch": 4.95,
794
+ "grad_norm": 0.022051149979233742,
795
+ "learning_rate": 0.0005107208872458411,
796
+ "loss": 0.0252,
797
+ "step": 2700
798
+ },
799
+ {
800
+ "epoch": 4.99,
801
+ "grad_norm": 0.17456993460655212,
802
+ "learning_rate": 0.0005060998151571165,
803
+ "loss": 0.0274,
804
+ "step": 2725
805
+ },
806
+ {
807
+ "epoch": 5.0,
808
+ "eval_loss": 0.18937160074710846,
809
+ "eval_runtime": 177.0599,
810
+ "eval_samples_per_second": 4.592,
811
+ "eval_steps_per_second": 0.768,
812
+ "step": 2730
813
+ },
814
+ {
815
+ "epoch": 5.04,
816
+ "grad_norm": 0.643435537815094,
817
+ "learning_rate": 0.0005014787430683919,
818
+ "loss": 0.0211,
819
+ "step": 2750
820
+ },
821
+ {
822
+ "epoch": 5.08,
823
+ "grad_norm": 0.35862746834754944,
824
+ "learning_rate": 0.0004968576709796673,
825
+ "loss": 0.0073,
826
+ "step": 2775
827
+ },
828
+ {
829
+ "epoch": 5.13,
830
+ "grad_norm": 0.5732066035270691,
831
+ "learning_rate": 0.0004922365988909427,
832
+ "loss": 0.0083,
833
+ "step": 2800
834
+ },
835
+ {
836
+ "epoch": 5.17,
837
+ "grad_norm": 0.21464449167251587,
838
+ "learning_rate": 0.0004876155268022181,
839
+ "loss": 0.0104,
840
+ "step": 2825
841
+ },
842
+ {
843
+ "epoch": 5.22,
844
+ "grad_norm": 0.1674581915140152,
845
+ "learning_rate": 0.0004829944547134935,
846
+ "loss": 0.0093,
847
+ "step": 2850
848
+ },
849
+ {
850
+ "epoch": 5.27,
851
+ "grad_norm": 0.03593946248292923,
852
+ "learning_rate": 0.000478373382624769,
853
+ "loss": 0.0119,
854
+ "step": 2875
855
+ },
856
+ {
857
+ "epoch": 5.31,
858
+ "grad_norm": 0.18074722588062286,
859
+ "learning_rate": 0.0004737523105360444,
860
+ "loss": 0.0097,
861
+ "step": 2900
862
+ },
863
+ {
864
+ "epoch": 5.36,
865
+ "grad_norm": 0.06277300417423248,
866
+ "learning_rate": 0.0004691312384473198,
867
+ "loss": 0.0137,
868
+ "step": 2925
869
+ },
870
+ {
871
+ "epoch": 5.4,
872
+ "grad_norm": 0.20016886293888092,
873
+ "learning_rate": 0.0004645101663585952,
874
+ "loss": 0.0204,
875
+ "step": 2950
876
+ },
877
+ {
878
+ "epoch": 5.45,
879
+ "grad_norm": 0.1815144419670105,
880
+ "learning_rate": 0.0004598890942698706,
881
+ "loss": 0.0162,
882
+ "step": 2975
883
+ },
884
+ {
885
+ "epoch": 5.49,
886
+ "grad_norm": 0.5112192034721375,
887
+ "learning_rate": 0.00045526802218114607,
888
+ "loss": 0.0131,
889
+ "step": 3000
890
+ },
891
+ {
892
+ "epoch": 5.54,
893
+ "grad_norm": 0.1796441674232483,
894
+ "learning_rate": 0.0004506469500924215,
895
+ "loss": 0.0176,
896
+ "step": 3025
897
+ },
898
+ {
899
+ "epoch": 5.59,
900
+ "grad_norm": 0.4108269214630127,
901
+ "learning_rate": 0.0004460258780036969,
902
+ "loss": 0.0196,
903
+ "step": 3050
904
+ },
905
+ {
906
+ "epoch": 5.63,
907
+ "grad_norm": 0.4271663427352905,
908
+ "learning_rate": 0.0004414048059149723,
909
+ "loss": 0.017,
910
+ "step": 3075
911
+ },
912
+ {
913
+ "epoch": 5.68,
914
+ "grad_norm": 0.2981961667537689,
915
+ "learning_rate": 0.0004367837338262477,
916
+ "loss": 0.012,
917
+ "step": 3100
918
+ },
919
+ {
920
+ "epoch": 5.72,
921
+ "grad_norm": 0.392818808555603,
922
+ "learning_rate": 0.0004321626617375231,
923
+ "loss": 0.0115,
924
+ "step": 3125
925
+ },
926
+ {
927
+ "epoch": 5.77,
928
+ "grad_norm": 0.00586000457406044,
929
+ "learning_rate": 0.0004275415896487985,
930
+ "loss": 0.0115,
931
+ "step": 3150
932
+ },
933
+ {
934
+ "epoch": 5.82,
935
+ "grad_norm": 0.2224288433790207,
936
+ "learning_rate": 0.0004229205175600739,
937
+ "loss": 0.0127,
938
+ "step": 3175
939
+ },
940
+ {
941
+ "epoch": 5.86,
942
+ "grad_norm": 0.28421640396118164,
943
+ "learning_rate": 0.00041829944547134933,
944
+ "loss": 0.0124,
945
+ "step": 3200
946
+ },
947
+ {
948
+ "epoch": 5.91,
949
+ "grad_norm": 0.3791782557964325,
950
+ "learning_rate": 0.00041367837338262474,
951
+ "loss": 0.016,
952
+ "step": 3225
953
+ },
954
+ {
955
+ "epoch": 5.95,
956
+ "grad_norm": 0.12688513100147247,
957
+ "learning_rate": 0.0004090573012939002,
958
+ "loss": 0.0212,
959
+ "step": 3250
960
+ },
961
+ {
962
+ "epoch": 6.0,
963
+ "grad_norm": 0.009004692547023296,
964
+ "learning_rate": 0.0004044362292051756,
965
+ "loss": 0.02,
966
+ "step": 3275
967
+ },
968
+ {
969
+ "epoch": 6.0,
970
+ "eval_loss": 0.18766650557518005,
971
+ "eval_runtime": 177.5092,
972
+ "eval_samples_per_second": 4.58,
973
+ "eval_steps_per_second": 0.766,
974
+ "step": 3276
975
+ },
976
+ {
977
+ "epoch": 6.04,
978
+ "grad_norm": 0.014529082924127579,
979
+ "learning_rate": 0.000399815157116451,
980
+ "loss": 0.012,
981
+ "step": 3300
982
+ },
983
+ {
984
+ "epoch": 6.09,
985
+ "grad_norm": 0.16003918647766113,
986
+ "learning_rate": 0.0003951940850277264,
987
+ "loss": 0.0078,
988
+ "step": 3325
989
+ },
990
+ {
991
+ "epoch": 6.14,
992
+ "grad_norm": 0.042826466262340546,
993
+ "learning_rate": 0.0003905730129390019,
994
+ "loss": 0.0116,
995
+ "step": 3350
996
+ },
997
+ {
998
+ "epoch": 6.18,
999
+ "grad_norm": 0.0034067954402416945,
1000
+ "learning_rate": 0.0003859519408502773,
1001
+ "loss": 0.006,
1002
+ "step": 3375
1003
+ },
1004
+ {
1005
+ "epoch": 6.23,
1006
+ "grad_norm": 0.005681981332600117,
1007
+ "learning_rate": 0.0003813308687615527,
1008
+ "loss": 0.0088,
1009
+ "step": 3400
1010
+ },
1011
+ {
1012
+ "epoch": 6.27,
1013
+ "grad_norm": 0.05403963476419449,
1014
+ "learning_rate": 0.0003767097966728281,
1015
+ "loss": 0.0104,
1016
+ "step": 3425
1017
+ },
1018
+ {
1019
+ "epoch": 6.32,
1020
+ "grad_norm": 0.1421121209859848,
1021
+ "learning_rate": 0.0003720887245841035,
1022
+ "loss": 0.0066,
1023
+ "step": 3450
1024
+ },
1025
+ {
1026
+ "epoch": 6.36,
1027
+ "grad_norm": 0.02004937455058098,
1028
+ "learning_rate": 0.0003674676524953789,
1029
+ "loss": 0.0075,
1030
+ "step": 3475
1031
+ },
1032
+ {
1033
+ "epoch": 6.41,
1034
+ "grad_norm": 0.009357332251966,
1035
+ "learning_rate": 0.0003628465804066544,
1036
+ "loss": 0.0065,
1037
+ "step": 3500
1038
+ },
1039
+ {
1040
+ "epoch": 6.46,
1041
+ "grad_norm": 0.01666351594030857,
1042
+ "learning_rate": 0.0003582255083179298,
1043
+ "loss": 0.006,
1044
+ "step": 3525
1045
+ },
1046
+ {
1047
+ "epoch": 6.5,
1048
+ "grad_norm": 0.24134355783462524,
1049
+ "learning_rate": 0.0003536044362292052,
1050
+ "loss": 0.0088,
1051
+ "step": 3550
1052
+ },
1053
+ {
1054
+ "epoch": 6.55,
1055
+ "grad_norm": 0.14924415946006775,
1056
+ "learning_rate": 0.0003489833641404806,
1057
+ "loss": 0.007,
1058
+ "step": 3575
1059
+ },
1060
+ {
1061
+ "epoch": 6.59,
1062
+ "grad_norm": 0.12202003598213196,
1063
+ "learning_rate": 0.000344362292051756,
1064
+ "loss": 0.0101,
1065
+ "step": 3600
1066
+ },
1067
+ {
1068
+ "epoch": 6.64,
1069
+ "grad_norm": 0.0060227783396840096,
1070
+ "learning_rate": 0.0003397412199630314,
1071
+ "loss": 0.0117,
1072
+ "step": 3625
1073
+ },
1074
+ {
1075
+ "epoch": 6.68,
1076
+ "grad_norm": 0.3869228959083557,
1077
+ "learning_rate": 0.0003351201478743068,
1078
+ "loss": 0.0151,
1079
+ "step": 3650
1080
+ },
1081
+ {
1082
+ "epoch": 6.73,
1083
+ "grad_norm": 0.018938152119517326,
1084
+ "learning_rate": 0.00033049907578558223,
1085
+ "loss": 0.0076,
1086
+ "step": 3675
1087
+ },
1088
+ {
1089
+ "epoch": 6.78,
1090
+ "grad_norm": 0.018859045580029488,
1091
+ "learning_rate": 0.00032587800369685764,
1092
+ "loss": 0.0085,
1093
+ "step": 3700
1094
+ },
1095
+ {
1096
+ "epoch": 6.82,
1097
+ "grad_norm": 0.08804900199174881,
1098
+ "learning_rate": 0.0003212569316081331,
1099
+ "loss": 0.0097,
1100
+ "step": 3725
1101
+ },
1102
+ {
1103
+ "epoch": 6.87,
1104
+ "grad_norm": 0.3045863211154938,
1105
+ "learning_rate": 0.0003166358595194085,
1106
+ "loss": 0.0132,
1107
+ "step": 3750
1108
+ },
1109
+ {
1110
+ "epoch": 6.91,
1111
+ "grad_norm": 0.022158470004796982,
1112
+ "learning_rate": 0.0003120147874306839,
1113
+ "loss": 0.0124,
1114
+ "step": 3775
1115
+ },
1116
+ {
1117
+ "epoch": 6.96,
1118
+ "grad_norm": 0.15056921541690826,
1119
+ "learning_rate": 0.0003073937153419594,
1120
+ "loss": 0.0087,
1121
+ "step": 3800
1122
+ },
1123
+ {
1124
+ "epoch": 7.0,
1125
+ "eval_loss": 0.19078923761844635,
1126
+ "eval_runtime": 177.5513,
1127
+ "eval_samples_per_second": 4.579,
1128
+ "eval_steps_per_second": 0.766,
1129
+ "step": 3822
1130
+ },
1131
+ {
1132
+ "epoch": 7.01,
1133
+ "grad_norm": 0.04336291924118996,
1134
+ "learning_rate": 0.0003027726432532348,
1135
+ "loss": 0.0086,
1136
+ "step": 3825
1137
+ },
1138
+ {
1139
+ "epoch": 7.05,
1140
+ "grad_norm": 0.0327971875667572,
1141
+ "learning_rate": 0.0002981515711645102,
1142
+ "loss": 0.0084,
1143
+ "step": 3850
1144
+ },
1145
+ {
1146
+ "epoch": 7.1,
1147
+ "grad_norm": 0.0314444899559021,
1148
+ "learning_rate": 0.0002935304990757856,
1149
+ "loss": 0.0048,
1150
+ "step": 3875
1151
+ },
1152
+ {
1153
+ "epoch": 7.14,
1154
+ "grad_norm": 0.17276029288768768,
1155
+ "learning_rate": 0.000288909426987061,
1156
+ "loss": 0.007,
1157
+ "step": 3900
1158
+ },
1159
+ {
1160
+ "epoch": 7.19,
1161
+ "grad_norm": 0.18024314939975739,
1162
+ "learning_rate": 0.0002842883548983364,
1163
+ "loss": 0.0074,
1164
+ "step": 3925
1165
+ },
1166
+ {
1167
+ "epoch": 7.23,
1168
+ "grad_norm": 0.01734893210232258,
1169
+ "learning_rate": 0.0002796672828096118,
1170
+ "loss": 0.0071,
1171
+ "step": 3950
1172
+ },
1173
+ {
1174
+ "epoch": 7.28,
1175
+ "grad_norm": 0.01721636950969696,
1176
+ "learning_rate": 0.0002750462107208873,
1177
+ "loss": 0.0123,
1178
+ "step": 3975
1179
+ },
1180
+ {
1181
+ "epoch": 7.33,
1182
+ "grad_norm": 0.03225923702120781,
1183
+ "learning_rate": 0.0002704251386321627,
1184
+ "loss": 0.0061,
1185
+ "step": 4000
1186
+ },
1187
+ {
1188
+ "epoch": 7.37,
1189
+ "grad_norm": 0.10785706341266632,
1190
+ "learning_rate": 0.0002658040665434381,
1191
+ "loss": 0.0071,
1192
+ "step": 4025
1193
+ },
1194
+ {
1195
+ "epoch": 7.42,
1196
+ "grad_norm": 0.02195531316101551,
1197
+ "learning_rate": 0.0002611829944547135,
1198
+ "loss": 0.0067,
1199
+ "step": 4050
1200
+ },
1201
+ {
1202
+ "epoch": 7.46,
1203
+ "grad_norm": 0.025887854397296906,
1204
+ "learning_rate": 0.0002565619223659889,
1205
+ "loss": 0.0072,
1206
+ "step": 4075
1207
+ },
1208
+ {
1209
+ "epoch": 7.51,
1210
+ "grad_norm": 1.8573029041290283,
1211
+ "learning_rate": 0.0002519408502772643,
1212
+ "loss": 0.0044,
1213
+ "step": 4100
1214
+ },
1215
+ {
1216
+ "epoch": 7.55,
1217
+ "grad_norm": 0.41556769609451294,
1218
+ "learning_rate": 0.0002473197781885397,
1219
+ "loss": 0.0076,
1220
+ "step": 4125
1221
+ },
1222
+ {
1223
+ "epoch": 7.6,
1224
+ "grad_norm": 0.0036406666040420532,
1225
+ "learning_rate": 0.0002426987060998152,
1226
+ "loss": 0.0054,
1227
+ "step": 4150
1228
+ },
1229
+ {
1230
+ "epoch": 7.65,
1231
+ "grad_norm": 0.1950559765100479,
1232
+ "learning_rate": 0.0002380776340110906,
1233
+ "loss": 0.0052,
1234
+ "step": 4175
1235
+ },
1236
+ {
1237
+ "epoch": 7.69,
1238
+ "grad_norm": 0.01785474270582199,
1239
+ "learning_rate": 0.000233456561922366,
1240
+ "loss": 0.007,
1241
+ "step": 4200
1242
+ },
1243
+ {
1244
+ "epoch": 7.74,
1245
+ "grad_norm": 0.26933544874191284,
1246
+ "learning_rate": 0.0002288354898336414,
1247
+ "loss": 0.0048,
1248
+ "step": 4225
1249
+ },
1250
+ {
1251
+ "epoch": 7.78,
1252
+ "grad_norm": 0.19295917451381683,
1253
+ "learning_rate": 0.00022421441774491682,
1254
+ "loss": 0.0035,
1255
+ "step": 4250
1256
+ },
1257
+ {
1258
+ "epoch": 7.83,
1259
+ "grad_norm": 0.008535887114703655,
1260
+ "learning_rate": 0.00021959334565619225,
1261
+ "loss": 0.0063,
1262
+ "step": 4275
1263
+ },
1264
+ {
1265
+ "epoch": 7.88,
1266
+ "grad_norm": 0.16601914167404175,
1267
+ "learning_rate": 0.00021497227356746766,
1268
+ "loss": 0.0049,
1269
+ "step": 4300
1270
+ },
1271
+ {
1272
+ "epoch": 7.92,
1273
+ "grad_norm": 0.25450438261032104,
1274
+ "learning_rate": 0.00021035120147874306,
1275
+ "loss": 0.0069,
1276
+ "step": 4325
1277
+ },
1278
+ {
1279
+ "epoch": 7.97,
1280
+ "grad_norm": 0.049375709146261215,
1281
+ "learning_rate": 0.00020573012939001847,
1282
+ "loss": 0.0066,
1283
+ "step": 4350
1284
+ },
1285
+ {
1286
+ "epoch": 8.0,
1287
+ "eval_loss": 0.2085200548171997,
1288
+ "eval_runtime": 177.9722,
1289
+ "eval_samples_per_second": 4.568,
1290
+ "eval_steps_per_second": 0.764,
1291
+ "step": 4368
1292
+ },
1293
+ {
1294
+ "epoch": 8.01,
1295
+ "grad_norm": 0.06922808289527893,
1296
+ "learning_rate": 0.00020110905730129388,
1297
+ "loss": 0.0042,
1298
+ "step": 4375
1299
+ },
1300
+ {
1301
+ "epoch": 8.06,
1302
+ "grad_norm": 0.04170389473438263,
1303
+ "learning_rate": 0.00019648798521256934,
1304
+ "loss": 0.0037,
1305
+ "step": 4400
1306
+ },
1307
+ {
1308
+ "epoch": 8.1,
1309
+ "grad_norm": 0.010052547790110111,
1310
+ "learning_rate": 0.00019186691312384475,
1311
+ "loss": 0.0029,
1312
+ "step": 4425
1313
+ },
1314
+ {
1315
+ "epoch": 8.15,
1316
+ "grad_norm": 0.25184884667396545,
1317
+ "learning_rate": 0.00018724584103512016,
1318
+ "loss": 0.0039,
1319
+ "step": 4450
1320
+ },
1321
+ {
1322
+ "epoch": 8.2,
1323
+ "grad_norm": 0.07106045633554459,
1324
+ "learning_rate": 0.00018262476894639556,
1325
+ "loss": 0.0039,
1326
+ "step": 4475
1327
+ },
1328
+ {
1329
+ "epoch": 8.24,
1330
+ "grad_norm": 0.002000249456614256,
1331
+ "learning_rate": 0.00017800369685767097,
1332
+ "loss": 0.0056,
1333
+ "step": 4500
1334
+ },
1335
+ {
1336
+ "epoch": 8.29,
1337
+ "grad_norm": 0.025201383978128433,
1338
+ "learning_rate": 0.0001733826247689464,
1339
+ "loss": 0.0031,
1340
+ "step": 4525
1341
+ },
1342
+ {
1343
+ "epoch": 8.33,
1344
+ "grad_norm": 0.0007307173800654709,
1345
+ "learning_rate": 0.0001687615526802218,
1346
+ "loss": 0.0031,
1347
+ "step": 4550
1348
+ },
1349
+ {
1350
+ "epoch": 8.38,
1351
+ "grad_norm": 0.010259617120027542,
1352
+ "learning_rate": 0.00016414048059149722,
1353
+ "loss": 0.0036,
1354
+ "step": 4575
1355
+ },
1356
+ {
1357
+ "epoch": 8.42,
1358
+ "grad_norm": 0.004237270914018154,
1359
+ "learning_rate": 0.00015951940850277263,
1360
+ "loss": 0.0038,
1361
+ "step": 4600
1362
+ },
1363
+ {
1364
+ "epoch": 8.47,
1365
+ "grad_norm": 0.02443511225283146,
1366
+ "learning_rate": 0.0001548983364140481,
1367
+ "loss": 0.0044,
1368
+ "step": 4625
1369
+ },
1370
+ {
1371
+ "epoch": 8.52,
1372
+ "grad_norm": 0.039590246975421906,
1373
+ "learning_rate": 0.0001502772643253235,
1374
+ "loss": 0.0019,
1375
+ "step": 4650
1376
+ },
1377
+ {
1378
+ "epoch": 8.56,
1379
+ "grad_norm": 0.30276018381118774,
1380
+ "learning_rate": 0.0001456561922365989,
1381
+ "loss": 0.0023,
1382
+ "step": 4675
1383
+ },
1384
+ {
1385
+ "epoch": 8.61,
1386
+ "grad_norm": 0.05218060687184334,
1387
+ "learning_rate": 0.0001410351201478743,
1388
+ "loss": 0.003,
1389
+ "step": 4700
1390
+ },
1391
+ {
1392
+ "epoch": 8.65,
1393
+ "grad_norm": 0.02608703263103962,
1394
+ "learning_rate": 0.00013641404805914972,
1395
+ "loss": 0.0027,
1396
+ "step": 4725
1397
+ },
1398
+ {
1399
+ "epoch": 8.7,
1400
+ "grad_norm": 0.007796884514391422,
1401
+ "learning_rate": 0.00013179297597042515,
1402
+ "loss": 0.0038,
1403
+ "step": 4750
1404
+ },
1405
+ {
1406
+ "epoch": 8.75,
1407
+ "grad_norm": 0.008572472259402275,
1408
+ "learning_rate": 0.00012717190388170056,
1409
+ "loss": 0.0018,
1410
+ "step": 4775
1411
+ },
1412
+ {
1413
+ "epoch": 8.79,
1414
+ "grad_norm": 0.0034019711893051863,
1415
+ "learning_rate": 0.00012255083179297597,
1416
+ "loss": 0.003,
1417
+ "step": 4800
1418
+ },
1419
+ {
1420
+ "epoch": 8.84,
1421
+ "grad_norm": 0.003986136056482792,
1422
+ "learning_rate": 0.00011792975970425139,
1423
+ "loss": 0.0047,
1424
+ "step": 4825
1425
+ },
1426
+ {
1427
+ "epoch": 8.88,
1428
+ "grad_norm": 0.055789873003959656,
1429
+ "learning_rate": 0.00011330868761552681,
1430
+ "loss": 0.0021,
1431
+ "step": 4850
1432
+ },
1433
+ {
1434
+ "epoch": 8.93,
1435
+ "grad_norm": 0.07775359600782394,
1436
+ "learning_rate": 0.00010868761552680221,
1437
+ "loss": 0.0032,
1438
+ "step": 4875
1439
+ },
1440
+ {
1441
+ "epoch": 8.97,
1442
+ "grad_norm": 0.0017645555781200528,
1443
+ "learning_rate": 0.00010406654343807764,
1444
+ "loss": 0.0055,
1445
+ "step": 4900
1446
+ },
1447
+ {
1448
+ "epoch": 9.0,
1449
+ "eval_loss": 0.21004897356033325,
1450
+ "eval_runtime": 178.8955,
1451
+ "eval_samples_per_second": 4.545,
1452
+ "eval_steps_per_second": 0.76,
1453
+ "step": 4914
1454
+ },
1455
+ {
1456
+ "epoch": 9.02,
1457
+ "grad_norm": 0.22125497460365295,
1458
+ "learning_rate": 9.944547134935306e-05,
1459
+ "loss": 0.0031,
1460
+ "step": 4925
1461
+ },
1462
+ {
1463
+ "epoch": 9.07,
1464
+ "grad_norm": 0.003768475726246834,
1465
+ "learning_rate": 9.482439926062846e-05,
1466
+ "loss": 0.0013,
1467
+ "step": 4950
1468
+ },
1469
+ {
1470
+ "epoch": 9.11,
1471
+ "grad_norm": 0.013520549982786179,
1472
+ "learning_rate": 9.020332717190388e-05,
1473
+ "loss": 0.0025,
1474
+ "step": 4975
1475
+ },
1476
+ {
1477
+ "epoch": 9.16,
1478
+ "grad_norm": 0.009503871202468872,
1479
+ "learning_rate": 8.558225508317929e-05,
1480
+ "loss": 0.0024,
1481
+ "step": 5000
1482
+ },
1483
+ {
1484
+ "epoch": 9.2,
1485
+ "grad_norm": 0.0057460549287498,
1486
+ "learning_rate": 8.096118299445473e-05,
1487
+ "loss": 0.0015,
1488
+ "step": 5025
1489
+ },
1490
+ {
1491
+ "epoch": 9.25,
1492
+ "grad_norm": 0.06969017535448074,
1493
+ "learning_rate": 7.634011090573013e-05,
1494
+ "loss": 0.0017,
1495
+ "step": 5050
1496
+ },
1497
+ {
1498
+ "epoch": 9.29,
1499
+ "grad_norm": 0.1530989110469818,
1500
+ "learning_rate": 7.171903881700554e-05,
1501
+ "loss": 0.0022,
1502
+ "step": 5075
1503
+ },
1504
+ {
1505
+ "epoch": 9.34,
1506
+ "grad_norm": 0.1752089112997055,
1507
+ "learning_rate": 6.709796672828096e-05,
1508
+ "loss": 0.0018,
1509
+ "step": 5100
1510
+ },
1511
+ {
1512
+ "epoch": 9.39,
1513
+ "grad_norm": 0.023138588294386864,
1514
+ "learning_rate": 6.247689463955638e-05,
1515
+ "loss": 0.0014,
1516
+ "step": 5125
1517
+ },
1518
+ {
1519
+ "epoch": 9.43,
1520
+ "grad_norm": 0.005098209250718355,
1521
+ "learning_rate": 5.785582255083179e-05,
1522
+ "loss": 0.0012,
1523
+ "step": 5150
1524
+ },
1525
+ {
1526
+ "epoch": 9.48,
1527
+ "grad_norm": 0.007919879630208015,
1528
+ "learning_rate": 5.323475046210721e-05,
1529
+ "loss": 0.0023,
1530
+ "step": 5175
1531
+ },
1532
+ {
1533
+ "epoch": 9.52,
1534
+ "grad_norm": 0.0019298276165500283,
1535
+ "learning_rate": 4.8613678373382625e-05,
1536
+ "loss": 0.0015,
1537
+ "step": 5200
1538
+ },
1539
+ {
1540
+ "epoch": 9.57,
1541
+ "grad_norm": 0.0023822402581572533,
1542
+ "learning_rate": 4.3992606284658045e-05,
1543
+ "loss": 0.0011,
1544
+ "step": 5225
1545
+ },
1546
+ {
1547
+ "epoch": 9.62,
1548
+ "grad_norm": 0.03612617775797844,
1549
+ "learning_rate": 3.937153419593346e-05,
1550
+ "loss": 0.001,
1551
+ "step": 5250
1552
+ },
1553
+ {
1554
+ "epoch": 9.66,
1555
+ "grad_norm": 0.03683371841907501,
1556
+ "learning_rate": 3.4750462107208874e-05,
1557
+ "loss": 0.0016,
1558
+ "step": 5275
1559
+ },
1560
+ {
1561
+ "epoch": 9.71,
1562
+ "grad_norm": 0.04906224459409714,
1563
+ "learning_rate": 3.012939001848429e-05,
1564
+ "loss": 0.0022,
1565
+ "step": 5300
1566
+ },
1567
+ {
1568
+ "epoch": 9.75,
1569
+ "grad_norm": 0.08069704473018646,
1570
+ "learning_rate": 2.5508317929759705e-05,
1571
+ "loss": 0.0015,
1572
+ "step": 5325
1573
+ },
1574
+ {
1575
+ "epoch": 9.8,
1576
+ "grad_norm": 0.13353778421878815,
1577
+ "learning_rate": 2.088724584103512e-05,
1578
+ "loss": 0.0013,
1579
+ "step": 5350
1580
+ },
1581
+ {
1582
+ "epoch": 9.84,
1583
+ "grad_norm": 0.10152421146631241,
1584
+ "learning_rate": 1.6266173752310537e-05,
1585
+ "loss": 0.0015,
1586
+ "step": 5375
1587
+ },
1588
+ {
1589
+ "epoch": 9.89,
1590
+ "grad_norm": 0.010886043310165405,
1591
+ "learning_rate": 1.1645101663585952e-05,
1592
+ "loss": 0.0017,
1593
+ "step": 5400
1594
+ },
1595
+ {
1596
+ "epoch": 9.94,
1597
+ "grad_norm": 0.009057571180164814,
1598
+ "learning_rate": 7.024029574861368e-06,
1599
+ "loss": 0.0016,
1600
+ "step": 5425
1601
+ },
1602
+ {
1603
+ "epoch": 9.98,
1604
+ "grad_norm": 0.020738158375024796,
1605
+ "learning_rate": 2.402957486136784e-06,
1606
+ "loss": 0.0013,
1607
+ "step": 5450
1608
+ },
1609
+ {
1610
+ "epoch": 10.0,
1611
+ "eval_loss": 0.21373072266578674,
1612
+ "eval_runtime": 177.6097,
1613
+ "eval_samples_per_second": 4.577,
1614
+ "eval_steps_per_second": 0.766,
1615
+ "step": 5460
1616
+ },
1617
+ {
1618
+ "epoch": 10.0,
1619
+ "step": 5460,
1620
+ "total_flos": 9.7789895073792e+18,
1621
+ "train_loss": 0.05130936206342318,
1622
+ "train_runtime": 10728.949,
1623
+ "train_samples_per_second": 3.051,
1624
+ "train_steps_per_second": 0.509
1625
+ }
1626
+ ],
1627
+ "logging_steps": 25,
1628
+ "max_steps": 5460,
1629
+ "num_input_tokens_seen": 0,
1630
+ "num_train_epochs": 10,
1631
+ "save_steps": 500,
1632
+ "total_flos": 9.7789895073792e+18,
1633
+ "train_batch_size": 6,
1634
+ "trial_name": null,
1635
+ "trial_params": null
1636
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d450ec50cab4188af1d2f839c282d646159565242704493acf7a0046664a3f1
3
+ size 5112