RaphaelMourad committed on
Commit fa64459
1 Parent(s): ea054ed

Upload 9 files

config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "_name_or_path": "../MistralModels/models/Mixtral-8x7B-v0.1-small-4096",
+   "architectures": [
+     "MixtralForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 768,
+   "max_position_embeddings": 512,
+   "model_type": "mixtral",
+   "num_attention_heads": 8,
+   "num_experts_per_tok": 1,
+   "num_hidden_layers": 8,
+   "num_key_value_heads": 8,
+   "num_local_experts": 8,
+   "output_router_logits": false,
+   "rms_norm_eps": 1e-05,
+   "rope_theta": 1000000.0,
+   "router_aux_loss_coef": 0.02,
+   "router_jitter_noise": 0.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.43.3",
+   "use_cache": true,
+   "vocab_size": 4096
+ }
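
The config above describes a small Mixtral-style mixture-of-experts causal language model: 8 hidden layers, hidden size 768, 8 attention heads, 8 local experts with 1 expert routed per token, a 4096-entry vocabulary and bfloat16 weights. Below is a minimal sketch of reading this config and instantiating the model with the transformers library; the local directory name is a hypothetical placeholder for wherever the files from this commit are checked out.

import torch
from transformers import AutoConfig, AutoModelForCausalLM

# Hypothetical local path to the files uploaded in this commit.
model_dir = "./Mixtral-8x7B-v0.1-small-4096"

config = AutoConfig.from_pretrained(model_dir)   # parses config.json
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.bfloat16)  # loads model.safetensors
print(config.model_type, config.num_hidden_layers, config.num_local_experts)
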
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "transformers_version": "4.43.3"
+ }
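
generation_config.json only pins the BOS/EOS token ids copied over from the model config. A short sketch of loading it, with the same hypothetical checkout path as above:

from transformers import GenerationConfig

model_dir = "./Mixtral-8x7B-v0.1-small-4096"  # hypothetical path, as above
gen_config = GenerationConfig.from_pretrained(model_dir)  # parses generation_config.json
print(gen_config.bos_token_id, gen_config.eos_token_id)   # -> 1, 2
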
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:81c550de27b0cfb958bd90295e8081ba41021975958f42ab55709ffed1088c28
+ size 276979168
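
model.safetensors is stored through Git LFS, so the diff only shows the pointer file (sha256 and size, roughly 277 MB of bfloat16 weights); the tensors themselves are fetched with git lfs pull or a Hub download. A sketch of inspecting the weights directly, assuming the real file has been pulled into the working directory:

from safetensors.torch import load_file

state_dict = load_file("model.safetensors")  # maps parameter names to tensors
n_params = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {n_params / 1e6:.1f}M parameters")
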
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37f947d6b54eeedc56f985f58c76b5403af723450de37052dd484bc469768e1b
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c97752dc999f5f01bcf49874b8735b2a56080af04fd7b12e23eeed58185e97d8
+ size 1064
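
rng_state.pth and scheduler.pt hold the Trainer's resumable training state (random-number-generator states and the learning-rate scheduler state); together with trainer_state.json and training_args.bin below, they are what transformers' Trainer reads when resuming via resume_from_checkpoint. A hedged sketch of inspecting them; weights_only=False is an assumption about the installed torch version, needed because these files contain plain Python objects rather than only tensors:

import torch

sched_state = torch.load("scheduler.pt", weights_only=False)  # LR scheduler state_dict
rng_state = torch.load("rng_state.pth", weights_only=False)   # python/numpy/torch RNG states
print(sorted(sched_state.keys()))
print(sorted(rng_state.keys()))
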
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"tokenizer_class": "PreTrainedTokenizerFast", "unk_token": "[UNK]", "cls_token": "[CLS]", "sep_token": "[SEP]", "pad_token": "[PAD]", "mask_token": "[MASK]"}
trainer_state.json ADDED
@@ -0,0 +1,1773 @@
1
+ {
2
+ "best_metric": 6.787229061126709,
3
+ "best_model_checkpoint": "./results/models/checkpoint-106530",
4
+ "epoch": 30.0,
5
+ "eval_steps": 500,
6
+ "global_step": 106530,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.14080540692762603,
13
+ "grad_norm": 0.2255859375,
14
+ "learning_rate": 0.00398873556744579,
15
+ "loss": 6.8932,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.28161081385525205,
20
+ "grad_norm": 0.9609375,
21
+ "learning_rate": 0.0039774711348915795,
22
+ "loss": 6.8292,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.42241622078287805,
27
+ "grad_norm": 1.6015625,
28
+ "learning_rate": 0.00396620670233737,
29
+ "loss": 6.8237,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.5632216277105041,
34
+ "grad_norm": 2.75,
35
+ "learning_rate": 0.00395494226978316,
36
+ "loss": 6.8763,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.7040270346381301,
41
+ "grad_norm": 1.375,
42
+ "learning_rate": 0.00394367783722895,
43
+ "loss": 6.9046,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.8448324415657561,
48
+ "grad_norm": 1.1171875,
49
+ "learning_rate": 0.003932413404674739,
50
+ "loss": 6.8564,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.9856378484933821,
55
+ "grad_norm": 2.25,
56
+ "learning_rate": 0.003921148972120529,
57
+ "loss": 6.8481,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 1.0,
62
+ "eval_loss": 6.840273380279541,
63
+ "eval_runtime": 197.295,
64
+ "eval_samples_per_second": 10.137,
65
+ "eval_steps_per_second": 1.267,
66
+ "step": 3551
67
+ },
68
+ {
69
+ "epoch": 1.1264432554210082,
70
+ "grad_norm": 6.875,
71
+ "learning_rate": 0.003909884539566319,
72
+ "loss": 6.8419,
73
+ "step": 4000
74
+ },
75
+ {
76
+ "epoch": 1.267248662348634,
77
+ "grad_norm": 0.349609375,
78
+ "learning_rate": 0.003898620107012109,
79
+ "loss": 6.9106,
80
+ "step": 4500
81
+ },
82
+ {
83
+ "epoch": 1.4080540692762602,
84
+ "grad_norm": 1.96875,
85
+ "learning_rate": 0.003887355674457899,
86
+ "loss": 6.8742,
87
+ "step": 5000
88
+ },
89
+ {
90
+ "epoch": 1.5488594762038863,
91
+ "grad_norm": 1.734375,
92
+ "learning_rate": 0.0038760912419036893,
93
+ "loss": 6.8742,
94
+ "step": 5500
95
+ },
96
+ {
97
+ "epoch": 1.6896648831315122,
98
+ "grad_norm": 4.125,
99
+ "learning_rate": 0.003864826809349479,
100
+ "loss": 6.8928,
101
+ "step": 6000
102
+ },
103
+ {
104
+ "epoch": 1.8304702900591383,
105
+ "grad_norm": 2.125,
106
+ "learning_rate": 0.003853562376795269,
107
+ "loss": 6.8663,
108
+ "step": 6500
109
+ },
110
+ {
111
+ "epoch": 1.9712756969867642,
112
+ "grad_norm": 3.640625,
113
+ "learning_rate": 0.003842297944241059,
114
+ "loss": 6.8515,
115
+ "step": 7000
116
+ },
117
+ {
118
+ "epoch": 2.0,
119
+ "eval_loss": 6.85520601272583,
120
+ "eval_runtime": 192.2095,
121
+ "eval_samples_per_second": 10.405,
122
+ "eval_steps_per_second": 1.301,
123
+ "step": 7102
124
+ },
125
+ {
126
+ "epoch": 2.1120811039143903,
127
+ "grad_norm": 55.75,
128
+ "learning_rate": 0.0038310335116868485,
129
+ "loss": 6.8759,
130
+ "step": 7500
131
+ },
132
+ {
133
+ "epoch": 2.2528865108420164,
134
+ "grad_norm": 15.0,
135
+ "learning_rate": 0.0038197690791326386,
136
+ "loss": 6.9025,
137
+ "step": 8000
138
+ },
139
+ {
140
+ "epoch": 2.3936919177696425,
141
+ "grad_norm": 0.259765625,
142
+ "learning_rate": 0.0038085046465784287,
143
+ "loss": 6.9002,
144
+ "step": 8500
145
+ },
146
+ {
147
+ "epoch": 2.534497324697268,
148
+ "grad_norm": 0.42578125,
149
+ "learning_rate": 0.003797240214024219,
150
+ "loss": 6.8857,
151
+ "step": 9000
152
+ },
153
+ {
154
+ "epoch": 2.6753027316248943,
155
+ "grad_norm": 2.984375,
156
+ "learning_rate": 0.0037859757814700085,
157
+ "loss": 6.882,
158
+ "step": 9500
159
+ },
160
+ {
161
+ "epoch": 2.8161081385525204,
162
+ "grad_norm": 1.953125,
163
+ "learning_rate": 0.0037747113489157986,
164
+ "loss": 6.8677,
165
+ "step": 10000
166
+ },
167
+ {
168
+ "epoch": 2.9569135454801465,
169
+ "grad_norm": 1.484375,
170
+ "learning_rate": 0.0037634469163615883,
171
+ "loss": 6.8696,
172
+ "step": 10500
173
+ },
174
+ {
175
+ "epoch": 3.0,
176
+ "eval_loss": 6.850285530090332,
177
+ "eval_runtime": 190.0725,
178
+ "eval_samples_per_second": 10.522,
179
+ "eval_steps_per_second": 1.315,
180
+ "step": 10653
181
+ },
182
+ {
183
+ "epoch": 3.0977189524077726,
184
+ "grad_norm": 0.94140625,
185
+ "learning_rate": 0.003752182483807378,
186
+ "loss": 6.8573,
187
+ "step": 11000
188
+ },
189
+ {
190
+ "epoch": 3.2385243593353983,
191
+ "grad_norm": 11.1875,
192
+ "learning_rate": 0.003740918051253168,
193
+ "loss": 6.8511,
194
+ "step": 11500
195
+ },
196
+ {
197
+ "epoch": 3.3793297662630244,
198
+ "grad_norm": 2.3125,
199
+ "learning_rate": 0.0037296536186989583,
200
+ "loss": 6.8586,
201
+ "step": 12000
202
+ },
203
+ {
204
+ "epoch": 3.5201351731906505,
205
+ "grad_norm": 134.0,
206
+ "learning_rate": 0.003718389186144748,
207
+ "loss": 6.8683,
208
+ "step": 12500
209
+ },
210
+ {
211
+ "epoch": 3.6609405801182766,
212
+ "grad_norm": 3.65625,
213
+ "learning_rate": 0.003707124753590538,
214
+ "loss": 6.8525,
215
+ "step": 13000
216
+ },
217
+ {
218
+ "epoch": 3.8017459870459027,
219
+ "grad_norm": 1.78125,
220
+ "learning_rate": 0.0036958603210363278,
221
+ "loss": 6.8547,
222
+ "step": 13500
223
+ },
224
+ {
225
+ "epoch": 3.9425513939735284,
226
+ "grad_norm": 14.1875,
227
+ "learning_rate": 0.0036845958884821174,
228
+ "loss": 6.8456,
229
+ "step": 14000
230
+ },
231
+ {
232
+ "epoch": 4.0,
233
+ "eval_loss": 6.838648319244385,
234
+ "eval_runtime": 187.9085,
235
+ "eval_samples_per_second": 10.643,
236
+ "eval_steps_per_second": 1.33,
237
+ "step": 14204
238
+ },
239
+ {
240
+ "epoch": 4.0833568009011545,
241
+ "grad_norm": 1.5078125,
242
+ "learning_rate": 0.0036733314559279076,
243
+ "loss": 6.8422,
244
+ "step": 14500
245
+ },
246
+ {
247
+ "epoch": 4.224162207828781,
248
+ "grad_norm": 2.484375,
249
+ "learning_rate": 0.0036620670233736977,
250
+ "loss": 6.859,
251
+ "step": 15000
252
+ },
253
+ {
254
+ "epoch": 4.364967614756407,
255
+ "grad_norm": 6.65625,
256
+ "learning_rate": 0.003650802590819488,
257
+ "loss": 6.8541,
258
+ "step": 15500
259
+ },
260
+ {
261
+ "epoch": 4.505773021684033,
262
+ "grad_norm": 6.59375,
263
+ "learning_rate": 0.0036395381582652775,
264
+ "loss": 6.8385,
265
+ "step": 16000
266
+ },
267
+ {
268
+ "epoch": 4.646578428611659,
269
+ "grad_norm": 3.546875,
270
+ "learning_rate": 0.003628273725711067,
271
+ "loss": 6.8464,
272
+ "step": 16500
273
+ },
274
+ {
275
+ "epoch": 4.787383835539285,
276
+ "grad_norm": 1.625,
277
+ "learning_rate": 0.0036170092931568573,
278
+ "loss": 6.8434,
279
+ "step": 17000
280
+ },
281
+ {
282
+ "epoch": 4.92818924246691,
283
+ "grad_norm": 16.25,
284
+ "learning_rate": 0.003605744860602647,
285
+ "loss": 6.8525,
286
+ "step": 17500
287
+ },
288
+ {
289
+ "epoch": 5.0,
290
+ "eval_loss": 6.835108280181885,
291
+ "eval_runtime": 207.3832,
292
+ "eval_samples_per_second": 9.644,
293
+ "eval_steps_per_second": 1.205,
294
+ "step": 17755
295
+ },
296
+ {
297
+ "epoch": 5.068994649394536,
298
+ "grad_norm": 2.5625,
299
+ "learning_rate": 0.003594480428048437,
300
+ "loss": 6.8404,
301
+ "step": 18000
302
+ },
303
+ {
304
+ "epoch": 5.2098000563221625,
305
+ "grad_norm": 4.8125,
306
+ "learning_rate": 0.0035832159954942273,
307
+ "loss": 6.8484,
308
+ "step": 18500
309
+ },
310
+ {
311
+ "epoch": 5.350605463249789,
312
+ "grad_norm": 3.8125,
313
+ "learning_rate": 0.003571951562940017,
314
+ "loss": 6.8502,
315
+ "step": 19000
316
+ },
317
+ {
318
+ "epoch": 5.491410870177415,
319
+ "grad_norm": 2.34375,
320
+ "learning_rate": 0.0035606871303858066,
321
+ "loss": 6.8384,
322
+ "step": 19500
323
+ },
324
+ {
325
+ "epoch": 5.632216277105041,
326
+ "grad_norm": 1.96875,
327
+ "learning_rate": 0.0035494226978315968,
328
+ "loss": 6.8478,
329
+ "step": 20000
330
+ },
331
+ {
332
+ "epoch": 5.773021684032667,
333
+ "grad_norm": 3.375,
334
+ "learning_rate": 0.003538158265277387,
335
+ "loss": 6.8339,
336
+ "step": 20500
337
+ },
338
+ {
339
+ "epoch": 5.913827090960293,
340
+ "grad_norm": 4.65625,
341
+ "learning_rate": 0.0035268938327231766,
342
+ "loss": 6.843,
343
+ "step": 21000
344
+ },
345
+ {
346
+ "epoch": 6.0,
347
+ "eval_loss": 6.82509708404541,
348
+ "eval_runtime": 199.4174,
349
+ "eval_samples_per_second": 10.029,
350
+ "eval_steps_per_second": 1.254,
351
+ "step": 21306
352
+ },
353
+ {
354
+ "epoch": 6.054632497887919,
355
+ "grad_norm": 3.171875,
356
+ "learning_rate": 0.0035156294001689667,
357
+ "loss": 6.8405,
358
+ "step": 21500
359
+ },
360
+ {
361
+ "epoch": 6.195437904815545,
362
+ "grad_norm": 10.9375,
363
+ "learning_rate": 0.003504364967614757,
364
+ "loss": 6.8416,
365
+ "step": 22000
366
+ },
367
+ {
368
+ "epoch": 6.336243311743171,
369
+ "grad_norm": 7.3125,
370
+ "learning_rate": 0.003493100535060546,
371
+ "loss": 6.8379,
372
+ "step": 22500
373
+ },
374
+ {
375
+ "epoch": 6.477048718670797,
376
+ "grad_norm": 4.65625,
377
+ "learning_rate": 0.003481836102506336,
378
+ "loss": 6.8452,
379
+ "step": 23000
380
+ },
381
+ {
382
+ "epoch": 6.617854125598423,
383
+ "grad_norm": 1.7734375,
384
+ "learning_rate": 0.0034705716699521263,
385
+ "loss": 6.8463,
386
+ "step": 23500
387
+ },
388
+ {
389
+ "epoch": 6.758659532526049,
390
+ "grad_norm": 11.25,
391
+ "learning_rate": 0.003459307237397916,
392
+ "loss": 6.8358,
393
+ "step": 24000
394
+ },
395
+ {
396
+ "epoch": 6.899464939453675,
397
+ "grad_norm": 1.7421875,
398
+ "learning_rate": 0.003448042804843706,
399
+ "loss": 6.8361,
400
+ "step": 24500
401
+ },
402
+ {
403
+ "epoch": 7.0,
404
+ "eval_loss": 6.824986457824707,
405
+ "eval_runtime": 181.2581,
406
+ "eval_samples_per_second": 11.034,
407
+ "eval_steps_per_second": 1.379,
408
+ "step": 24857
409
+ },
410
+ {
411
+ "epoch": 7.040270346381301,
412
+ "grad_norm": 3.84375,
413
+ "learning_rate": 0.0034367783722894962,
414
+ "loss": 6.8328,
415
+ "step": 25000
416
+ },
417
+ {
418
+ "epoch": 7.181075753308927,
419
+ "grad_norm": 1.4296875,
420
+ "learning_rate": 0.0034255139397352855,
421
+ "loss": 6.8272,
422
+ "step": 25500
423
+ },
424
+ {
425
+ "epoch": 7.321881160236553,
426
+ "grad_norm": 3.359375,
427
+ "learning_rate": 0.0034142495071810756,
428
+ "loss": 6.8164,
429
+ "step": 26000
430
+ },
431
+ {
432
+ "epoch": 7.462686567164179,
433
+ "grad_norm": 8.875,
434
+ "learning_rate": 0.0034029850746268657,
435
+ "loss": 6.8276,
436
+ "step": 26500
437
+ },
438
+ {
439
+ "epoch": 7.603491974091805,
440
+ "grad_norm": 2.078125,
441
+ "learning_rate": 0.003391720642072656,
442
+ "loss": 6.8248,
443
+ "step": 27000
444
+ },
445
+ {
446
+ "epoch": 7.744297381019431,
447
+ "grad_norm": 8.8125,
448
+ "learning_rate": 0.0033804562095184456,
449
+ "loss": 6.8188,
450
+ "step": 27500
451
+ },
452
+ {
453
+ "epoch": 7.885102787947057,
454
+ "grad_norm": 3.921875,
455
+ "learning_rate": 0.0033691917769642357,
456
+ "loss": 6.8232,
457
+ "step": 28000
458
+ },
459
+ {
460
+ "epoch": 8.0,
461
+ "eval_loss": 6.8248701095581055,
462
+ "eval_runtime": 170.7503,
463
+ "eval_samples_per_second": 11.713,
464
+ "eval_steps_per_second": 1.464,
465
+ "step": 28408
466
+ },
467
+ {
468
+ "epoch": 8.025908194874683,
469
+ "grad_norm": 5.9375,
470
+ "learning_rate": 0.0033579273444100254,
471
+ "loss": 6.8241,
472
+ "step": 28500
473
+ },
474
+ {
475
+ "epoch": 8.166713601802309,
476
+ "grad_norm": 5.8125,
477
+ "learning_rate": 0.003346662911855815,
478
+ "loss": 6.8311,
479
+ "step": 29000
480
+ },
481
+ {
482
+ "epoch": 8.307519008729935,
483
+ "grad_norm": 3.328125,
484
+ "learning_rate": 0.003335398479301605,
485
+ "loss": 6.8203,
486
+ "step": 29500
487
+ },
488
+ {
489
+ "epoch": 8.448324415657561,
490
+ "grad_norm": 7.46875,
491
+ "learning_rate": 0.0033241340467473953,
492
+ "loss": 6.8291,
493
+ "step": 30000
494
+ },
495
+ {
496
+ "epoch": 8.589129822585187,
497
+ "grad_norm": 4.59375,
498
+ "learning_rate": 0.003312869614193185,
499
+ "loss": 6.8261,
500
+ "step": 30500
501
+ },
502
+ {
503
+ "epoch": 8.729935229512813,
504
+ "grad_norm": 4.3125,
505
+ "learning_rate": 0.003301605181638975,
506
+ "loss": 6.8362,
507
+ "step": 31000
508
+ },
509
+ {
510
+ "epoch": 8.87074063644044,
511
+ "grad_norm": 11.25,
512
+ "learning_rate": 0.003290340749084765,
513
+ "loss": 6.8353,
514
+ "step": 31500
515
+ },
516
+ {
517
+ "epoch": 9.0,
518
+ "eval_loss": 6.821832656860352,
519
+ "eval_runtime": 210.9202,
520
+ "eval_samples_per_second": 9.482,
521
+ "eval_steps_per_second": 1.185,
522
+ "step": 31959
523
+ },
524
+ {
525
+ "epoch": 9.011546043368066,
526
+ "grad_norm": 4.46875,
527
+ "learning_rate": 0.003279076316530555,
528
+ "loss": 6.832,
529
+ "step": 32000
530
+ },
531
+ {
532
+ "epoch": 9.152351450295692,
533
+ "grad_norm": 4.0625,
534
+ "learning_rate": 0.0032678118839763446,
535
+ "loss": 6.8292,
536
+ "step": 32500
537
+ },
538
+ {
539
+ "epoch": 9.293156857223318,
540
+ "grad_norm": 1.984375,
541
+ "learning_rate": 0.0032565474514221347,
542
+ "loss": 6.8248,
543
+ "step": 33000
544
+ },
545
+ {
546
+ "epoch": 9.433962264150944,
547
+ "grad_norm": 43.25,
548
+ "learning_rate": 0.003245283018867925,
549
+ "loss": 6.8277,
550
+ "step": 33500
551
+ },
552
+ {
553
+ "epoch": 9.57476767107857,
554
+ "grad_norm": 1.7890625,
555
+ "learning_rate": 0.0032340185863137146,
556
+ "loss": 6.8229,
557
+ "step": 34000
558
+ },
559
+ {
560
+ "epoch": 9.715573078006196,
561
+ "grad_norm": 3.296875,
562
+ "learning_rate": 0.0032227541537595042,
563
+ "loss": 6.8194,
564
+ "step": 34500
565
+ },
566
+ {
567
+ "epoch": 9.85637848493382,
568
+ "grad_norm": 3.390625,
569
+ "learning_rate": 0.0032114897212052944,
570
+ "loss": 6.8225,
571
+ "step": 35000
572
+ },
573
+ {
574
+ "epoch": 9.997183891861447,
575
+ "grad_norm": 22.375,
576
+ "learning_rate": 0.003200225288651084,
577
+ "loss": 6.8291,
578
+ "step": 35500
579
+ },
580
+ {
581
+ "epoch": 10.0,
582
+ "eval_loss": 6.81277322769165,
583
+ "eval_runtime": 239.1719,
584
+ "eval_samples_per_second": 8.362,
585
+ "eval_steps_per_second": 1.045,
586
+ "step": 35510
587
+ },
588
+ {
589
+ "epoch": 10.137989298789073,
590
+ "grad_norm": 5.375,
591
+ "learning_rate": 0.003188960856096874,
592
+ "loss": 6.815,
593
+ "step": 36000
594
+ },
595
+ {
596
+ "epoch": 10.278794705716699,
597
+ "grad_norm": 5.25,
598
+ "learning_rate": 0.0031776964235426643,
599
+ "loss": 6.8173,
600
+ "step": 36500
601
+ },
602
+ {
603
+ "epoch": 10.419600112644325,
604
+ "grad_norm": 3.84375,
605
+ "learning_rate": 0.003166431990988454,
606
+ "loss": 6.8189,
607
+ "step": 37000
608
+ },
609
+ {
610
+ "epoch": 10.560405519571951,
611
+ "grad_norm": 1.1875,
612
+ "learning_rate": 0.0031551675584342437,
613
+ "loss": 6.828,
614
+ "step": 37500
615
+ },
616
+ {
617
+ "epoch": 10.701210926499577,
618
+ "grad_norm": 5.96875,
619
+ "learning_rate": 0.003143903125880034,
620
+ "loss": 6.8141,
621
+ "step": 38000
622
+ },
623
+ {
624
+ "epoch": 10.842016333427203,
625
+ "grad_norm": 117.0,
626
+ "learning_rate": 0.003132638693325824,
627
+ "loss": 6.8258,
628
+ "step": 38500
629
+ },
630
+ {
631
+ "epoch": 10.98282174035483,
632
+ "grad_norm": 3.40625,
633
+ "learning_rate": 0.0031213742607716136,
634
+ "loss": 6.8254,
635
+ "step": 39000
636
+ },
637
+ {
638
+ "epoch": 11.0,
639
+ "eval_loss": 6.814772129058838,
640
+ "eval_runtime": 228.9281,
641
+ "eval_samples_per_second": 8.736,
642
+ "eval_steps_per_second": 1.092,
643
+ "step": 39061
644
+ },
645
+ {
646
+ "epoch": 11.123627147282455,
647
+ "grad_norm": 1.578125,
648
+ "learning_rate": 0.0031101098282174037,
649
+ "loss": 6.8192,
650
+ "step": 39500
651
+ },
652
+ {
653
+ "epoch": 11.264432554210082,
654
+ "grad_norm": 2.6875,
655
+ "learning_rate": 0.003098845395663194,
656
+ "loss": 6.8217,
657
+ "step": 40000
658
+ },
659
+ {
660
+ "epoch": 11.405237961137708,
661
+ "grad_norm": 35.0,
662
+ "learning_rate": 0.003087580963108983,
663
+ "loss": 6.8172,
664
+ "step": 40500
665
+ },
666
+ {
667
+ "epoch": 11.546043368065334,
668
+ "grad_norm": 9.5625,
669
+ "learning_rate": 0.0030763165305547732,
670
+ "loss": 6.8307,
671
+ "step": 41000
672
+ },
673
+ {
674
+ "epoch": 11.68684877499296,
675
+ "grad_norm": 9.3125,
676
+ "learning_rate": 0.0030650520980005634,
677
+ "loss": 6.8237,
678
+ "step": 41500
679
+ },
680
+ {
681
+ "epoch": 11.827654181920586,
682
+ "grad_norm": 5.3125,
683
+ "learning_rate": 0.003053787665446353,
684
+ "loss": 6.8306,
685
+ "step": 42000
686
+ },
687
+ {
688
+ "epoch": 11.968459588848212,
689
+ "grad_norm": 5.34375,
690
+ "learning_rate": 0.003042523232892143,
691
+ "loss": 6.8138,
692
+ "step": 42500
693
+ },
694
+ {
695
+ "epoch": 12.0,
696
+ "eval_loss": 6.806704044342041,
697
+ "eval_runtime": 228.3922,
698
+ "eval_samples_per_second": 8.757,
699
+ "eval_steps_per_second": 1.095,
700
+ "step": 42612
701
+ },
702
+ {
703
+ "epoch": 12.109264995775838,
704
+ "grad_norm": 1.9296875,
705
+ "learning_rate": 0.0030312588003379333,
706
+ "loss": 6.8157,
707
+ "step": 43000
708
+ },
709
+ {
710
+ "epoch": 12.250070402703464,
711
+ "grad_norm": 5.25,
712
+ "learning_rate": 0.003019994367783723,
713
+ "loss": 6.8189,
714
+ "step": 43500
715
+ },
716
+ {
717
+ "epoch": 12.39087580963109,
718
+ "grad_norm": 3.078125,
719
+ "learning_rate": 0.0030087299352295127,
720
+ "loss": 6.8276,
721
+ "step": 44000
722
+ },
723
+ {
724
+ "epoch": 12.531681216558717,
725
+ "grad_norm": 3.46875,
726
+ "learning_rate": 0.002997465502675303,
727
+ "loss": 6.8092,
728
+ "step": 44500
729
+ },
730
+ {
731
+ "epoch": 12.672486623486343,
732
+ "grad_norm": 1.2421875,
733
+ "learning_rate": 0.002986201070121093,
734
+ "loss": 6.8152,
735
+ "step": 45000
736
+ },
737
+ {
738
+ "epoch": 12.813292030413969,
739
+ "grad_norm": 3.078125,
740
+ "learning_rate": 0.0029749366375668826,
741
+ "loss": 6.8141,
742
+ "step": 45500
743
+ },
744
+ {
745
+ "epoch": 12.954097437341593,
746
+ "grad_norm": 2.90625,
747
+ "learning_rate": 0.0029636722050126727,
748
+ "loss": 6.8209,
749
+ "step": 46000
750
+ },
751
+ {
752
+ "epoch": 13.0,
753
+ "eval_loss": 6.805023670196533,
754
+ "eval_runtime": 230.3566,
755
+ "eval_samples_per_second": 8.682,
756
+ "eval_steps_per_second": 1.085,
757
+ "step": 46163
758
+ },
759
+ {
760
+ "epoch": 13.09490284426922,
761
+ "grad_norm": 2.90625,
762
+ "learning_rate": 0.0029524077724584624,
763
+ "loss": 6.8179,
764
+ "step": 46500
765
+ },
766
+ {
767
+ "epoch": 13.235708251196845,
768
+ "grad_norm": 6.0,
769
+ "learning_rate": 0.002941143339904252,
770
+ "loss": 6.8193,
771
+ "step": 47000
772
+ },
773
+ {
774
+ "epoch": 13.376513658124471,
775
+ "grad_norm": 4.03125,
776
+ "learning_rate": 0.0029298789073500422,
777
+ "loss": 6.8153,
778
+ "step": 47500
779
+ },
780
+ {
781
+ "epoch": 13.517319065052098,
782
+ "grad_norm": 3.65625,
783
+ "learning_rate": 0.0029186144747958324,
784
+ "loss": 6.8046,
785
+ "step": 48000
786
+ },
787
+ {
788
+ "epoch": 13.658124471979724,
789
+ "grad_norm": 52.0,
790
+ "learning_rate": 0.002907350042241622,
791
+ "loss": 6.8161,
792
+ "step": 48500
793
+ },
794
+ {
795
+ "epoch": 13.79892987890735,
796
+ "grad_norm": 16.625,
797
+ "learning_rate": 0.002896085609687412,
798
+ "loss": 6.8094,
799
+ "step": 49000
800
+ },
801
+ {
802
+ "epoch": 13.939735285834976,
803
+ "grad_norm": 1.6328125,
804
+ "learning_rate": 0.002884821177133202,
805
+ "loss": 6.8181,
806
+ "step": 49500
807
+ },
808
+ {
809
+ "epoch": 14.0,
810
+ "eval_loss": 6.8096513748168945,
811
+ "eval_runtime": 235.2835,
812
+ "eval_samples_per_second": 8.5,
813
+ "eval_steps_per_second": 1.063,
814
+ "step": 49714
815
+ },
816
+ {
817
+ "epoch": 14.080540692762602,
818
+ "grad_norm": 1.140625,
819
+ "learning_rate": 0.002873556744578992,
820
+ "loss": 6.8197,
821
+ "step": 50000
822
+ },
823
+ {
824
+ "epoch": 14.221346099690228,
825
+ "grad_norm": 1.265625,
826
+ "learning_rate": 0.0028622923120247817,
827
+ "loss": 6.8097,
828
+ "step": 50500
829
+ },
830
+ {
831
+ "epoch": 14.362151506617854,
832
+ "grad_norm": 4.75,
833
+ "learning_rate": 0.002851027879470572,
834
+ "loss": 6.82,
835
+ "step": 51000
836
+ },
837
+ {
838
+ "epoch": 14.50295691354548,
839
+ "grad_norm": 2.28125,
840
+ "learning_rate": 0.002839763446916362,
841
+ "loss": 6.8203,
842
+ "step": 51500
843
+ },
844
+ {
845
+ "epoch": 14.643762320473106,
846
+ "grad_norm": 5.125,
847
+ "learning_rate": 0.0028284990143621516,
848
+ "loss": 6.807,
849
+ "step": 52000
850
+ },
851
+ {
852
+ "epoch": 14.784567727400733,
853
+ "grad_norm": 2.046875,
854
+ "learning_rate": 0.0028172345818079413,
855
+ "loss": 6.8153,
856
+ "step": 52500
857
+ },
858
+ {
859
+ "epoch": 14.925373134328359,
860
+ "grad_norm": 8.0,
861
+ "learning_rate": 0.0028059701492537314,
862
+ "loss": 6.8102,
863
+ "step": 53000
864
+ },
865
+ {
866
+ "epoch": 15.0,
867
+ "eval_loss": 6.802714824676514,
868
+ "eval_runtime": 235.2095,
869
+ "eval_samples_per_second": 8.503,
870
+ "eval_steps_per_second": 1.063,
871
+ "step": 53265
872
+ },
873
+ {
874
+ "epoch": 15.066178541255985,
875
+ "grad_norm": 16.5,
876
+ "learning_rate": 0.002794705716699521,
877
+ "loss": 6.8108,
878
+ "step": 53500
879
+ },
880
+ {
881
+ "epoch": 15.20698394818361,
882
+ "grad_norm": 2.015625,
883
+ "learning_rate": 0.0027834412841453112,
884
+ "loss": 6.8123,
885
+ "step": 54000
886
+ },
887
+ {
888
+ "epoch": 15.347789355111237,
889
+ "grad_norm": 2.90625,
890
+ "learning_rate": 0.0027721768515911013,
891
+ "loss": 6.8105,
892
+ "step": 54500
893
+ },
894
+ {
895
+ "epoch": 15.488594762038863,
896
+ "grad_norm": 8.375,
897
+ "learning_rate": 0.0027609124190368915,
898
+ "loss": 6.8163,
899
+ "step": 55000
900
+ },
901
+ {
902
+ "epoch": 15.629400168966487,
903
+ "grad_norm": 3.34375,
904
+ "learning_rate": 0.0027496479864826807,
905
+ "loss": 6.8062,
906
+ "step": 55500
907
+ },
908
+ {
909
+ "epoch": 15.770205575894114,
910
+ "grad_norm": 48.25,
911
+ "learning_rate": 0.002738383553928471,
912
+ "loss": 6.8065,
913
+ "step": 56000
914
+ },
915
+ {
916
+ "epoch": 15.91101098282174,
917
+ "grad_norm": 2.9375,
918
+ "learning_rate": 0.002727119121374261,
919
+ "loss": 6.8066,
920
+ "step": 56500
921
+ },
922
+ {
923
+ "epoch": 16.0,
924
+ "eval_loss": 6.799798011779785,
925
+ "eval_runtime": 204.1083,
926
+ "eval_samples_per_second": 9.799,
927
+ "eval_steps_per_second": 1.225,
928
+ "step": 56816
929
+ },
930
+ {
931
+ "epoch": 16.051816389749366,
932
+ "grad_norm": 136.0,
933
+ "learning_rate": 0.0027158546888200507,
934
+ "loss": 6.8064,
935
+ "step": 57000
936
+ },
937
+ {
938
+ "epoch": 16.192621796676992,
939
+ "grad_norm": 2.75,
940
+ "learning_rate": 0.002704590256265841,
941
+ "loss": 6.8083,
942
+ "step": 57500
943
+ },
944
+ {
945
+ "epoch": 16.333427203604618,
946
+ "grad_norm": 15.625,
947
+ "learning_rate": 0.002693325823711631,
948
+ "loss": 6.8038,
949
+ "step": 58000
950
+ },
951
+ {
952
+ "epoch": 16.474232610532244,
953
+ "grad_norm": 20.875,
954
+ "learning_rate": 0.00268206139115742,
955
+ "loss": 6.802,
956
+ "step": 58500
957
+ },
958
+ {
959
+ "epoch": 16.61503801745987,
960
+ "grad_norm": 15.3125,
961
+ "learning_rate": 0.0026707969586032103,
962
+ "loss": 6.8112,
963
+ "step": 59000
964
+ },
965
+ {
966
+ "epoch": 16.755843424387496,
967
+ "grad_norm": 8.1875,
968
+ "learning_rate": 0.0026595325260490004,
969
+ "loss": 6.8144,
970
+ "step": 59500
971
+ },
972
+ {
973
+ "epoch": 16.896648831315122,
974
+ "grad_norm": 5.03125,
975
+ "learning_rate": 0.0026482680934947905,
976
+ "loss": 6.8158,
977
+ "step": 60000
978
+ },
979
+ {
980
+ "epoch": 17.0,
981
+ "eval_loss": 6.8019022941589355,
982
+ "eval_runtime": 194.9738,
983
+ "eval_samples_per_second": 10.258,
984
+ "eval_steps_per_second": 1.282,
985
+ "step": 60367
986
+ },
987
+ {
988
+ "epoch": 17.03745423824275,
989
+ "grad_norm": 5.34375,
990
+ "learning_rate": 0.0026370036609405802,
991
+ "loss": 6.8102,
992
+ "step": 60500
993
+ },
994
+ {
995
+ "epoch": 17.178259645170375,
996
+ "grad_norm": 26.125,
997
+ "learning_rate": 0.0026257392283863703,
998
+ "loss": 6.8135,
999
+ "step": 61000
1000
+ },
1001
+ {
1002
+ "epoch": 17.319065052098,
1003
+ "grad_norm": 6.71875,
1004
+ "learning_rate": 0.00261447479583216,
1005
+ "loss": 6.8102,
1006
+ "step": 61500
1007
+ },
1008
+ {
1009
+ "epoch": 17.459870459025627,
1010
+ "grad_norm": 30.875,
1011
+ "learning_rate": 0.0026032103632779497,
1012
+ "loss": 6.8099,
1013
+ "step": 62000
1014
+ },
1015
+ {
1016
+ "epoch": 17.600675865953253,
1017
+ "grad_norm": 78.0,
1018
+ "learning_rate": 0.00259194593072374,
1019
+ "loss": 6.8046,
1020
+ "step": 62500
1021
+ },
1022
+ {
1023
+ "epoch": 17.74148127288088,
1024
+ "grad_norm": 14.625,
1025
+ "learning_rate": 0.00258068149816953,
1026
+ "loss": 6.809,
1027
+ "step": 63000
1028
+ },
1029
+ {
1030
+ "epoch": 17.882286679808505,
1031
+ "grad_norm": 170.0,
1032
+ "learning_rate": 0.0025694170656153197,
1033
+ "loss": 6.8035,
1034
+ "step": 63500
1035
+ },
1036
+ {
1037
+ "epoch": 18.0,
1038
+ "eval_loss": 6.801079273223877,
1039
+ "eval_runtime": 232.8698,
1040
+ "eval_samples_per_second": 8.588,
1041
+ "eval_steps_per_second": 1.074,
1042
+ "step": 63918
1043
+ },
1044
+ {
1045
+ "epoch": 18.02309208673613,
1046
+ "grad_norm": 2.296875,
1047
+ "learning_rate": 0.0025581526330611098,
1048
+ "loss": 6.8134,
1049
+ "step": 64000
1050
+ },
1051
+ {
1052
+ "epoch": 18.163897493663757,
1053
+ "grad_norm": 87.0,
1054
+ "learning_rate": 0.0025468882005068995,
1055
+ "loss": 6.8024,
1056
+ "step": 64500
1057
+ },
1058
+ {
1059
+ "epoch": 18.304702900591383,
1060
+ "grad_norm": 1.3828125,
1061
+ "learning_rate": 0.002535623767952689,
1062
+ "loss": 6.8068,
1063
+ "step": 65000
1064
+ },
1065
+ {
1066
+ "epoch": 18.44550830751901,
1067
+ "grad_norm": 22.5,
1068
+ "learning_rate": 0.0025243593353984793,
1069
+ "loss": 6.8083,
1070
+ "step": 65500
1071
+ },
1072
+ {
1073
+ "epoch": 18.586313714446636,
1074
+ "grad_norm": 3.578125,
1075
+ "learning_rate": 0.0025130949028442694,
1076
+ "loss": 6.8071,
1077
+ "step": 66000
1078
+ },
1079
+ {
1080
+ "epoch": 18.727119121374262,
1081
+ "grad_norm": 22.125,
1082
+ "learning_rate": 0.0025018304702900595,
1083
+ "loss": 6.8059,
1084
+ "step": 66500
1085
+ },
1086
+ {
1087
+ "epoch": 18.867924528301888,
1088
+ "grad_norm": 16.75,
1089
+ "learning_rate": 0.002490566037735849,
1090
+ "loss": 6.8056,
1091
+ "step": 67000
1092
+ },
1093
+ {
1094
+ "epoch": 19.0,
1095
+ "eval_loss": 6.797260761260986,
1096
+ "eval_runtime": 216.6541,
1097
+ "eval_samples_per_second": 9.231,
1098
+ "eval_steps_per_second": 1.154,
1099
+ "step": 67469
1100
+ },
1101
+ {
1102
+ "epoch": 19.008729935229514,
1103
+ "grad_norm": 4.625,
1104
+ "learning_rate": 0.002479301605181639,
1105
+ "loss": 6.8015,
1106
+ "step": 67500
1107
+ },
1108
+ {
1109
+ "epoch": 19.14953534215714,
1110
+ "grad_norm": 2.734375,
1111
+ "learning_rate": 0.002468037172627429,
1112
+ "loss": 6.8118,
1113
+ "step": 68000
1114
+ },
1115
+ {
1116
+ "epoch": 19.290340749084766,
1117
+ "grad_norm": 45.0,
1118
+ "learning_rate": 0.0024567727400732187,
1119
+ "loss": 6.7995,
1120
+ "step": 68500
1121
+ },
1122
+ {
1123
+ "epoch": 19.431146156012392,
1124
+ "grad_norm": 46.0,
1125
+ "learning_rate": 0.002445508307519009,
1126
+ "loss": 6.8098,
1127
+ "step": 69000
1128
+ },
1129
+ {
1130
+ "epoch": 19.57195156294002,
1131
+ "grad_norm": 7.09375,
1132
+ "learning_rate": 0.002434243874964799,
1133
+ "loss": 6.7972,
1134
+ "step": 69500
1135
+ },
1136
+ {
1137
+ "epoch": 19.712756969867645,
1138
+ "grad_norm": 91.5,
1139
+ "learning_rate": 0.002422979442410588,
1140
+ "loss": 6.8085,
1141
+ "step": 70000
1142
+ },
1143
+ {
1144
+ "epoch": 19.853562376795267,
1145
+ "grad_norm": 1.7890625,
1146
+ "learning_rate": 0.0024117150098563783,
1147
+ "loss": 6.8034,
1148
+ "step": 70500
1149
+ },
1150
+ {
1151
+ "epoch": 19.994367783722893,
1152
+ "grad_norm": 4.21875,
1153
+ "learning_rate": 0.0024004505773021685,
1154
+ "loss": 6.8024,
1155
+ "step": 71000
1156
+ },
1157
+ {
1158
+ "epoch": 20.0,
1159
+ "eval_loss": 6.794472694396973,
1160
+ "eval_runtime": 212.8014,
1161
+ "eval_samples_per_second": 9.398,
1162
+ "eval_steps_per_second": 1.175,
1163
+ "step": 71020
1164
+ },
1165
+ {
1166
+ "epoch": 20.13517319065052,
1167
+ "grad_norm": 4.21875,
1168
+ "learning_rate": 0.0023891861447479586,
1169
+ "loss": 6.7964,
1170
+ "step": 71500
1171
+ },
1172
+ {
1173
+ "epoch": 20.275978597578145,
1174
+ "grad_norm": 27.5,
1175
+ "learning_rate": 0.0023779217121937483,
1176
+ "loss": 6.8033,
1177
+ "step": 72000
1178
+ },
1179
+ {
1180
+ "epoch": 20.41678400450577,
1181
+ "grad_norm": 4.84375,
1182
+ "learning_rate": 0.0023666572796395384,
1183
+ "loss": 6.8066,
1184
+ "step": 72500
1185
+ },
1186
+ {
1187
+ "epoch": 20.557589411433398,
1188
+ "grad_norm": 10.0625,
1189
+ "learning_rate": 0.0023553928470853285,
1190
+ "loss": 6.8076,
1191
+ "step": 73000
1192
+ },
1193
+ {
1194
+ "epoch": 20.698394818361024,
1195
+ "grad_norm": 60.75,
1196
+ "learning_rate": 0.0023441284145311178,
1197
+ "loss": 6.8114,
1198
+ "step": 73500
1199
+ },
1200
+ {
1201
+ "epoch": 20.83920022528865,
1202
+ "grad_norm": 47.0,
1203
+ "learning_rate": 0.002332863981976908,
1204
+ "loss": 6.8046,
1205
+ "step": 74000
1206
+ },
1207
+ {
1208
+ "epoch": 20.980005632216276,
1209
+ "grad_norm": 4.5,
1210
+ "learning_rate": 0.002321599549422698,
1211
+ "loss": 6.8088,
1212
+ "step": 74500
1213
+ },
1214
+ {
1215
+ "epoch": 21.0,
1216
+ "eval_loss": 6.799040794372559,
1217
+ "eval_runtime": 211.5781,
1218
+ "eval_samples_per_second": 9.453,
1219
+ "eval_steps_per_second": 1.182,
1220
+ "step": 74571
1221
+ },
1222
+ {
1223
+ "epoch": 21.120811039143902,
1224
+ "grad_norm": 1.7734375,
1225
+ "learning_rate": 0.0023103351168684877,
1226
+ "loss": 6.8154,
1227
+ "step": 75000
1228
+ },
1229
+ {
1230
+ "epoch": 21.26161644607153,
1231
+ "grad_norm": 9.75,
1232
+ "learning_rate": 0.002299070684314278,
1233
+ "loss": 6.8074,
1234
+ "step": 75500
1235
+ },
1236
+ {
1237
+ "epoch": 21.402421852999154,
1238
+ "grad_norm": 12.9375,
1239
+ "learning_rate": 0.0022878062517600675,
1240
+ "loss": 6.8097,
1241
+ "step": 76000
1242
+ },
1243
+ {
1244
+ "epoch": 21.54322725992678,
1245
+ "grad_norm": 2.703125,
1246
+ "learning_rate": 0.002276541819205857,
1247
+ "loss": 6.8016,
1248
+ "step": 76500
1249
+ },
1250
+ {
1251
+ "epoch": 21.684032666854407,
1252
+ "grad_norm": 2.875,
1253
+ "learning_rate": 0.0022652773866516473,
1254
+ "loss": 6.7996,
1255
+ "step": 77000
1256
+ },
1257
+ {
1258
+ "epoch": 21.824838073782033,
1259
+ "grad_norm": 4.84375,
1260
+ "learning_rate": 0.0022540129540974375,
1261
+ "loss": 6.8058,
1262
+ "step": 77500
1263
+ },
1264
+ {
1265
+ "epoch": 21.96564348070966,
1266
+ "grad_norm": 7.15625,
1267
+ "learning_rate": 0.0022427485215432276,
1268
+ "loss": 6.8024,
1269
+ "step": 78000
1270
+ },
1271
+ {
1272
+ "epoch": 22.0,
1273
+ "eval_loss": 6.7944440841674805,
1274
+ "eval_runtime": 187.9546,
1275
+ "eval_samples_per_second": 10.641,
1276
+ "eval_steps_per_second": 1.33,
1277
+ "step": 78122
1278
+ },
1279
+ {
1280
+ "epoch": 22.106448887637285,
1281
+ "grad_norm": 4.65625,
1282
+ "learning_rate": 0.0022314840889890173,
1283
+ "loss": 6.8025,
1284
+ "step": 78500
1285
+ },
1286
+ {
1287
+ "epoch": 22.24725429456491,
1288
+ "grad_norm": 8.9375,
1289
+ "learning_rate": 0.002220219656434807,
1290
+ "loss": 6.8027,
1291
+ "step": 79000
1292
+ },
1293
+ {
1294
+ "epoch": 22.388059701492537,
1295
+ "grad_norm": 8.75,
1296
+ "learning_rate": 0.002208955223880597,
1297
+ "loss": 6.8032,
1298
+ "step": 79500
1299
+ },
1300
+ {
1301
+ "epoch": 22.528865108420163,
1302
+ "grad_norm": 10.625,
1303
+ "learning_rate": 0.0021976907913263868,
1304
+ "loss": 6.7998,
1305
+ "step": 80000
1306
+ },
1307
+ {
1308
+ "epoch": 22.66967051534779,
1309
+ "grad_norm": 12.375,
1310
+ "learning_rate": 0.002186426358772177,
1311
+ "loss": 6.8102,
1312
+ "step": 80500
1313
+ },
1314
+ {
1315
+ "epoch": 22.810475922275415,
1316
+ "grad_norm": 3.625,
1317
+ "learning_rate": 0.002175161926217967,
1318
+ "loss": 6.7944,
1319
+ "step": 81000
1320
+ },
1321
+ {
1322
+ "epoch": 22.95128132920304,
1323
+ "grad_norm": 3.484375,
1324
+ "learning_rate": 0.0021638974936637567,
1325
+ "loss": 6.8079,
1326
+ "step": 81500
1327
+ },
1328
+ {
1329
+ "epoch": 23.0,
1330
+ "eval_loss": 6.793390274047852,
1331
+ "eval_runtime": 214.7382,
1332
+ "eval_samples_per_second": 9.314,
1333
+ "eval_steps_per_second": 1.164,
1334
+ "step": 81673
1335
+ },
1336
+ {
1337
+ "epoch": 23.092086736130668,
1338
+ "grad_norm": 5.46875,
1339
+ "learning_rate": 0.0021526330611095464,
1340
+ "loss": 6.8065,
1341
+ "step": 82000
1342
+ },
1343
+ {
1344
+ "epoch": 23.232892143058294,
1345
+ "grad_norm": 3.125,
1346
+ "learning_rate": 0.0021413686285553365,
1347
+ "loss": 6.8017,
1348
+ "step": 82500
1349
+ },
1350
+ {
1351
+ "epoch": 23.37369754998592,
1352
+ "grad_norm": 77.5,
1353
+ "learning_rate": 0.0021301041960011266,
1354
+ "loss": 6.8055,
1355
+ "step": 83000
1356
+ },
1357
+ {
1358
+ "epoch": 23.514502956913546,
1359
+ "grad_norm": 1.5390625,
1360
+ "learning_rate": 0.0021188397634469163,
1361
+ "loss": 6.8064,
1362
+ "step": 83500
1363
+ },
1364
+ {
1365
+ "epoch": 23.655308363841172,
1366
+ "grad_norm": 3.28125,
1367
+ "learning_rate": 0.0021075753308927065,
1368
+ "loss": 6.8029,
1369
+ "step": 84000
1370
+ },
1371
+ {
1372
+ "epoch": 23.796113770768798,
1373
+ "grad_norm": 4.5,
1374
+ "learning_rate": 0.0020963108983384966,
1375
+ "loss": 6.7993,
1376
+ "step": 84500
1377
+ },
1378
+ {
1379
+ "epoch": 23.936919177696424,
1380
+ "grad_norm": 3.078125,
1381
+ "learning_rate": 0.002085046465784286,
1382
+ "loss": 6.7938,
1383
+ "step": 85000
1384
+ },
1385
+ {
1386
+ "epoch": 24.0,
1387
+ "eval_loss": 6.793313503265381,
1388
+ "eval_runtime": 192.9768,
1389
+ "eval_samples_per_second": 10.364,
1390
+ "eval_steps_per_second": 1.295,
1391
+ "step": 85224
1392
+ },
1393
+ {
1394
+ "epoch": 24.07772458462405,
1395
+ "grad_norm": 16.875,
1396
+ "learning_rate": 0.002073782033230076,
1397
+ "loss": 6.8043,
1398
+ "step": 85500
1399
+ },
1400
+ {
1401
+ "epoch": 24.218529991551677,
1402
+ "grad_norm": 7.0,
1403
+ "learning_rate": 0.002062517600675866,
1404
+ "loss": 6.7964,
1405
+ "step": 86000
1406
+ },
1407
+ {
1408
+ "epoch": 24.359335398479303,
1409
+ "grad_norm": 4.46875,
1410
+ "learning_rate": 0.0020512531681216558,
1411
+ "loss": 6.8079,
1412
+ "step": 86500
1413
+ },
1414
+ {
1415
+ "epoch": 24.50014080540693,
1416
+ "grad_norm": 3.1875,
1417
+ "learning_rate": 0.002039988735567446,
1418
+ "loss": 6.8029,
1419
+ "step": 87000
1420
+ },
1421
+ {
1422
+ "epoch": 24.640946212334555,
1423
+ "grad_norm": 3.203125,
1424
+ "learning_rate": 0.002028724303013236,
1425
+ "loss": 6.7963,
1426
+ "step": 87500
1427
+ },
1428
+ {
1429
+ "epoch": 24.78175161926218,
1430
+ "grad_norm": 8.9375,
1431
+ "learning_rate": 0.0020174598704590253,
1432
+ "loss": 6.8056,
1433
+ "step": 88000
1434
+ },
1435
+ {
1436
+ "epoch": 24.922557026189807,
1437
+ "grad_norm": 2.859375,
1438
+ "learning_rate": 0.0020061954379048154,
1439
+ "loss": 6.8061,
1440
+ "step": 88500
1441
+ },
1442
+ {
1443
+ "epoch": 25.0,
1444
+ "eval_loss": 6.796795845031738,
1445
+ "eval_runtime": 181.8838,
1446
+ "eval_samples_per_second": 10.996,
1447
+ "eval_steps_per_second": 1.375,
1448
+ "step": 88775
1449
+ },
1450
+ {
1451
+ "epoch": 25.063362433117433,
1452
+ "grad_norm": 6.09375,
1453
+ "learning_rate": 0.0019949310053506055,
1454
+ "loss": 6.7973,
1455
+ "step": 89000
1456
+ },
1457
+ {
1458
+ "epoch": 25.20416784004506,
1459
+ "grad_norm": 1.09375,
1460
+ "learning_rate": 0.001983666572796395,
1461
+ "loss": 6.8059,
1462
+ "step": 89500
1463
+ },
1464
+ {
1465
+ "epoch": 25.344973246972685,
1466
+ "grad_norm": 24.375,
1467
+ "learning_rate": 0.0019724021402421853,
1468
+ "loss": 6.8048,
1469
+ "step": 90000
1470
+ },
1471
+ {
1472
+ "epoch": 25.48577865390031,
1473
+ "grad_norm": 3.171875,
1474
+ "learning_rate": 0.0019611377076879754,
1475
+ "loss": 6.7963,
1476
+ "step": 90500
1477
+ },
1478
+ {
1479
+ "epoch": 25.626584060827938,
1480
+ "grad_norm": 1.546875,
1481
+ "learning_rate": 0.0019498732751337651,
1482
+ "loss": 6.7984,
1483
+ "step": 91000
1484
+ },
1485
+ {
1486
+ "epoch": 25.76738946775556,
1487
+ "grad_norm": 24.375,
1488
+ "learning_rate": 0.0019386088425795553,
1489
+ "loss": 6.7948,
1490
+ "step": 91500
1491
+ },
1492
+ {
1493
+ "epoch": 25.908194874683186,
1494
+ "grad_norm": 29.125,
1495
+ "learning_rate": 0.001927344410025345,
1496
+ "loss": 6.8014,
1497
+ "step": 92000
1498
+ },
1499
+ {
1500
+ "epoch": 26.0,
1501
+ "eval_loss": 6.788823127746582,
1502
+ "eval_runtime": 205.9541,
1503
+ "eval_samples_per_second": 9.711,
1504
+ "eval_steps_per_second": 1.214,
1505
+ "step": 92326
1506
+ },
1507
+ {
1508
+ "epoch": 26.049000281610812,
1509
+ "grad_norm": 3.046875,
1510
+ "learning_rate": 0.0019160799774711349,
1511
+ "loss": 6.794,
1512
+ "step": 92500
1513
+ },
1514
+ {
1515
+ "epoch": 26.18980568853844,
1516
+ "grad_norm": 2.1875,
1517
+ "learning_rate": 0.001904815544916925,
1518
+ "loss": 6.7969,
1519
+ "step": 93000
1520
+ },
1521
+ {
1522
+ "epoch": 26.330611095466065,
1523
+ "grad_norm": 3.78125,
1524
+ "learning_rate": 0.0018935511123627147,
1525
+ "loss": 6.7898,
1526
+ "step": 93500
1527
+ },
1528
+ {
1529
+ "epoch": 26.47141650239369,
1530
+ "grad_norm": 20.875,
1531
+ "learning_rate": 0.0018822866798085048,
1532
+ "loss": 6.7979,
1533
+ "step": 94000
1534
+ },
1535
+ {
1536
+ "epoch": 26.612221909321317,
1537
+ "grad_norm": 3.21875,
1538
+ "learning_rate": 0.0018710222472542947,
1539
+ "loss": 6.7962,
1540
+ "step": 94500
1541
+ },
1542
+ {
1543
+ "epoch": 26.753027316248943,
1544
+ "grad_norm": 11.375,
1545
+ "learning_rate": 0.0018597578147000844,
1546
+ "loss": 6.7987,
1547
+ "step": 95000
1548
+ },
1549
+ {
1550
+ "epoch": 26.89383272317657,
1551
+ "grad_norm": 8.875,
1552
+ "learning_rate": 0.0018484933821458745,
1553
+ "loss": 6.801,
1554
+ "step": 95500
1555
+ },
1556
+ {
1557
+ "epoch": 27.0,
1558
+ "eval_loss": 6.789410591125488,
1559
+ "eval_runtime": 227.3793,
1560
+ "eval_samples_per_second": 8.796,
1561
+ "eval_steps_per_second": 1.099,
1562
+ "step": 95877
1563
+ },
1564
+ {
1565
+ "epoch": 27.034638130104195,
1566
+ "grad_norm": 1.46875,
1567
+ "learning_rate": 0.0018372289495916644,
1568
+ "loss": 6.7975,
1569
+ "step": 96000
1570
+ },
1571
+ {
1572
+ "epoch": 27.17544353703182,
1573
+ "grad_norm": 80.5,
1574
+ "learning_rate": 0.0018259645170374543,
1575
+ "loss": 6.7984,
1576
+ "step": 96500
1577
+ },
1578
+ {
1579
+ "epoch": 27.316248943959447,
1580
+ "grad_norm": 2.671875,
1581
+ "learning_rate": 0.0018147000844832442,
1582
+ "loss": 6.7878,
1583
+ "step": 97000
1584
+ },
1585
+ {
1586
+ "epoch": 27.457054350887073,
1587
+ "grad_norm": 9.0,
1588
+ "learning_rate": 0.0018034356519290341,
1589
+ "loss": 6.7973,
1590
+ "step": 97500
1591
+ },
1592
+ {
1593
+ "epoch": 27.5978597578147,
1594
+ "grad_norm": 2.1875,
1595
+ "learning_rate": 0.001792171219374824,
1596
+ "loss": 6.7993,
1597
+ "step": 98000
1598
+ },
1599
+ {
1600
+ "epoch": 27.738665164742326,
1601
+ "grad_norm": 5.65625,
1602
+ "learning_rate": 0.001780906786820614,
1603
+ "loss": 6.7979,
1604
+ "step": 98500
1605
+ },
1606
+ {
1607
+ "epoch": 27.879470571669952,
1608
+ "grad_norm": 6.53125,
1609
+ "learning_rate": 0.001769642354266404,
1610
+ "loss": 6.7953,
1611
+ "step": 99000
1612
+ },
1613
+ {
1614
+ "epoch": 28.0,
1615
+ "eval_loss": 6.787979602813721,
1616
+ "eval_runtime": 193.1872,
1617
+ "eval_samples_per_second": 10.353,
1618
+ "eval_steps_per_second": 1.294,
1619
+ "step": 99428
1620
+ },
1621
+ {
1622
+ "epoch": 28.020275978597578,
1623
+ "grad_norm": 8.5625,
1624
+ "learning_rate": 0.0017583779217121938,
1625
+ "loss": 6.7974,
1626
+ "step": 99500
1627
+ },
1628
+ {
1629
+ "epoch": 28.161081385525204,
1630
+ "grad_norm": 7.25,
1631
+ "learning_rate": 0.0017471134891579837,
1632
+ "loss": 6.7964,
1633
+ "step": 100000
1634
+ },
1635
+ {
1636
+ "epoch": 28.30188679245283,
1637
+ "grad_norm": 5.5625,
1638
+ "learning_rate": 0.0017358490566037738,
1639
+ "loss": 6.7967,
1640
+ "step": 100500
1641
+ },
1642
+ {
1643
+ "epoch": 28.442692199380456,
1644
+ "grad_norm": 3.34375,
1645
+ "learning_rate": 0.0017245846240495635,
1646
+ "loss": 6.7968,
1647
+ "step": 101000
1648
+ },
1649
+ {
1650
+ "epoch": 28.583497606308082,
1651
+ "grad_norm": 8.3125,
1652
+ "learning_rate": 0.0017133201914953536,
1653
+ "loss": 6.7985,
1654
+ "step": 101500
1655
+ },
1656
+ {
1657
+ "epoch": 28.72430301323571,
1658
+ "grad_norm": 32.25,
1659
+ "learning_rate": 0.0017020557589411435,
1660
+ "loss": 6.7975,
1661
+ "step": 102000
1662
+ },
1663
+ {
1664
+ "epoch": 28.865108420163335,
1665
+ "grad_norm": 5.25,
1666
+ "learning_rate": 0.0016907913263869332,
1667
+ "loss": 6.7887,
1668
+ "step": 102500
1669
+ },
1670
+ {
1671
+ "epoch": 29.0,
1672
+ "eval_loss": 6.7877984046936035,
1673
+ "eval_runtime": 234.4697,
1674
+ "eval_samples_per_second": 8.53,
1675
+ "eval_steps_per_second": 1.066,
1676
+ "step": 102979
1677
+ },
1678
+ {
1679
+ "epoch": 29.00591382709096,
1680
+ "grad_norm": 16.75,
1681
+ "learning_rate": 0.0016795268938327233,
1682
+ "loss": 6.7878,
1683
+ "step": 103000
1684
+ },
1685
+ {
1686
+ "epoch": 29.146719234018587,
1687
+ "grad_norm": 4.65625,
1688
+ "learning_rate": 0.0016682624612785132,
1689
+ "loss": 6.7852,
1690
+ "step": 103500
1691
+ },
1692
+ {
1693
+ "epoch": 29.287524640946213,
1694
+ "grad_norm": 8.0625,
1695
+ "learning_rate": 0.001656998028724303,
1696
+ "loss": 6.7974,
1697
+ "step": 104000
1698
+ },
1699
+ {
1700
+ "epoch": 29.42833004787384,
1701
+ "grad_norm": 7.3125,
1702
+ "learning_rate": 0.001645733596170093,
1703
+ "loss": 6.8057,
1704
+ "step": 104500
1705
+ },
1706
+ {
1707
+ "epoch": 29.569135454801465,
1708
+ "grad_norm": 1.0546875,
1709
+ "learning_rate": 0.001634469163615883,
1710
+ "loss": 6.7982,
1711
+ "step": 105000
1712
+ },
1713
+ {
1714
+ "epoch": 29.70994086172909,
1715
+ "grad_norm": 11.125,
1716
+ "learning_rate": 0.0016232047310616728,
1717
+ "loss": 6.7948,
1718
+ "step": 105500
1719
+ },
1720
+ {
1721
+ "epoch": 29.850746268656717,
1722
+ "grad_norm": 9.8125,
1723
+ "learning_rate": 0.0016119402985074627,
1724
+ "loss": 6.7843,
1725
+ "step": 106000
1726
+ },
1727
+ {
1728
+ "epoch": 29.991551675584343,
1729
+ "grad_norm": 4.8125,
1730
+ "learning_rate": 0.0016006758659532524,
1731
+ "loss": 6.8003,
1732
+ "step": 106500
1733
+ },
1734
+ {
1735
+ "epoch": 30.0,
1736
+ "eval_loss": 6.787229061126709,
1737
+ "eval_runtime": 216.4819,
1738
+ "eval_samples_per_second": 9.239,
1739
+ "eval_steps_per_second": 1.155,
1740
+ "step": 106530
1741
+ }
1742
+ ],
1743
+ "logging_steps": 500,
1744
+ "max_steps": 177550,
1745
+ "num_input_tokens_seen": 0,
1746
+ "num_train_epochs": 50,
1747
+ "save_steps": 500,
1748
+ "stateful_callbacks": {
1749
+ "EarlyStoppingCallback": {
1750
+ "args": {
1751
+ "early_stopping_patience": 3,
1752
+ "early_stopping_threshold": 0.0
1753
+ },
1754
+ "attributes": {
1755
+ "early_stopping_patience_counter": 0
1756
+ }
1757
+ },
1758
+ "TrainerControl": {
1759
+ "args": {
1760
+ "should_epoch_stop": false,
1761
+ "should_evaluate": false,
1762
+ "should_log": false,
1763
+ "should_save": true,
1764
+ "should_training_stop": false
1765
+ },
1766
+ "attributes": {}
1767
+ }
1768
+ },
1769
+ "total_flos": 4.342179461275713e+18,
1770
+ "train_batch_size": 8,
1771
+ "trial_name": null,
1772
+ "trial_params": null
1773
+ }
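
trainer_state.json records the full training log: 30 of a planned 50 epochs (106,530 of 177,550 steps), a learning rate decaying from roughly 4e-3, an evaluation loss improving from 6.840 after epoch 1 to the best value 6.7872 at step 106530, and an EarlyStoppingCallback with patience 3 that has not yet triggered (should_training_stop is false). A small self-contained sketch of extracting the evaluation curve from this file:

import json

with open("trainer_state.json") as f:
    state = json.load(f)

print(state["best_metric"], state["best_model_checkpoint"])

# One (epoch, eval_loss) point per end-of-epoch evaluation entry in log_history.
curve = [(e["epoch"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
for epoch, loss in curve[-3:]:
    print(f"epoch {epoch:.0f}: eval_loss {loss:.4f}")
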
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03f7e64ba741ef0312bb7c236e2c187fd1f39c4a35b12a3a95d748b94a0d3a9c
+ size 5112
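
training_args.bin is a pickled TrainingArguments object, also stored via LFS. A hedged sketch of inspecting it; as above, weights_only=False is an assumption about the torch version, since the file holds a Python object rather than tensors:

import torch

args = torch.load("training_args.bin", weights_only=False)  # transformers TrainingArguments
print(args.num_train_epochs, args.per_device_train_batch_size, args.learning_rate)
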