emdemor committed on
Commit
dee7540
1 Parent(s): 519d7e6

Training in progress, step 50

Browse files
adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
  "o_proj",
25
  "q_proj",
26
- "up_proj",
27
  "down_proj",
28
  "gate_proj",
29
- "v_proj"
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "o_proj",
24
  "q_proj",
 
25
  "down_proj",
26
  "gate_proj",
27
+ "v_proj",
28
+ "k_proj",
29
+ "up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ba183c1c97dd8fd79734a2fb1011526a1bc0695f6a38a908cc3cc23a2265e55
3
  size 35668592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bf964742964d2ec503f25da4ead7a6c8a5691fc8d1cb60aa46b3a6fe8d444f8
3
  size 35668592
metrics.json CHANGED
@@ -1,18 +1 @@
1
- {"Step":20,"eval_loss":1.9001051188,"eval_runtime":1521.8531,"eval_samples_per_second":3.285,"eval_steps_per_second":0.411,"epoch":0.032}
2
- {"Step":40,"eval_loss":1.8370074034,"eval_runtime":1516.9668,"eval_samples_per_second":3.296,"eval_steps_per_second":0.412,"epoch":0.064}
3
- {"Step":60,"eval_loss":1.6439222097,"eval_runtime":1500.7581,"eval_samples_per_second":3.332,"eval_steps_per_second":0.416,"epoch":0.096}
4
- {"Step":80,"eval_loss":1.550334692,"eval_runtime":1506.1101,"eval_samples_per_second":3.32,"eval_steps_per_second":0.415,"epoch":0.128}
5
- {"Step":100,"eval_loss":1.519143939,"eval_runtime":1499.8012,"eval_samples_per_second":3.334,"eval_steps_per_second":0.417,"epoch":0.16}
6
- {"Step":120,"eval_loss":1.5071601868,"eval_runtime":1498.469,"eval_samples_per_second":3.337,"eval_steps_per_second":0.417,"epoch":0.192}
7
- {"Step":140,"eval_loss":1.4998296499,"eval_runtime":1486.0248,"eval_samples_per_second":3.365,"eval_steps_per_second":0.421,"epoch":0.224}
8
- {"Step":160,"eval_loss":1.4962984324,"eval_runtime":1491.4174,"eval_samples_per_second":3.353,"eval_steps_per_second":0.419,"epoch":0.256}
9
- {"Step":180,"eval_loss":1.4931029081,"eval_runtime":1501.5451,"eval_samples_per_second":3.33,"eval_steps_per_second":0.416,"epoch":0.288}
10
- {"Step":200,"eval_loss":1.4910326004,"eval_runtime":1501.9275,"eval_samples_per_second":3.329,"eval_steps_per_second":0.416,"epoch":0.32}
11
- {"Step":220,"eval_loss":1.488992691,"eval_runtime":1498.0439,"eval_samples_per_second":3.338,"eval_steps_per_second":0.417,"epoch":0.352}
12
- {"Step":240,"eval_loss":1.4867918491,"eval_runtime":1460.1808,"eval_samples_per_second":3.424,"eval_steps_per_second":0.428,"epoch":0.384}
13
- {"Step":260,"eval_loss":1.4859720469,"eval_runtime":1460.3853,"eval_samples_per_second":3.424,"eval_steps_per_second":0.428,"epoch":0.416}
14
- {"Step":280,"eval_loss":1.4846266508,"eval_runtime":1460.0793,"eval_samples_per_second":3.424,"eval_steps_per_second":0.428,"epoch":0.448}
15
- {"Step":300,"eval_loss":1.4849002361,"eval_runtime":1472.1795,"eval_samples_per_second":3.396,"eval_steps_per_second":0.425,"epoch":0.48}
16
- {"Step":320,"eval_loss":1.4825308323,"eval_runtime":1500.8065,"eval_samples_per_second":3.332,"eval_steps_per_second":0.416,"epoch":0.512}
17
- {"Step":340,"eval_loss":1.4830516577,"eval_runtime":1501.5622,"eval_samples_per_second":3.33,"eval_steps_per_second":0.416,"epoch":0.544}
18
- {"Step":360,"eval_loss":1.4818130732,"eval_runtime":1501.7616,"eval_samples_per_second":3.329,"eval_steps_per_second":0.416,"epoch":0.576}
 
1
+ {"Step":50,"eval_loss":1.7156720161,"eval_runtime":149.1859,"eval_samples_per_second":3.352,"eval_steps_per_second":0.422,"epoch":0.08}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
state.json CHANGED
@@ -1,289 +1,34 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.592,
5
- "eval_steps": 20,
6
- "global_step": 370,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.032,
13
- "grad_norm": 0.015696076676249504,
14
- "learning_rate": 5.319148936170213e-05,
15
- "loss": 1.9359,
16
- "step": 20
17
  },
18
  {
19
- "epoch": 0.032,
20
- "eval_loss": 1.9001051187515259,
21
- "eval_runtime": 1521.8531,
22
- "eval_samples_per_second": 3.285,
23
- "eval_steps_per_second": 0.411,
24
- "step": 20
25
- },
26
- {
27
- "epoch": 0.064,
28
- "grad_norm": 0.039251815527677536,
29
- "learning_rate": 0.00010638297872340425,
30
- "loss": 1.9075,
31
- "step": 40
32
- },
33
- {
34
- "epoch": 0.064,
35
- "eval_loss": 1.8370074033737183,
36
- "eval_runtime": 1516.9668,
37
- "eval_samples_per_second": 3.296,
38
- "eval_steps_per_second": 0.412,
39
- "step": 40
40
- },
41
- {
42
- "epoch": 0.096,
43
- "grad_norm": NaN,
44
- "learning_rate": 0.00015691489361702128,
45
- "loss": 1.7374,
46
- "step": 60
47
- },
48
- {
49
- "epoch": 0.096,
50
- "eval_loss": 1.643922209739685,
51
- "eval_runtime": 1500.7581,
52
- "eval_samples_per_second": 3.332,
53
- "eval_steps_per_second": 0.416,
54
- "step": 60
55
- },
56
- {
57
- "epoch": 0.128,
58
- "grad_norm": 0.10252918303012848,
59
- "learning_rate": 0.0002101063829787234,
60
- "loss": 1.6047,
61
- "step": 80
62
- },
63
- {
64
- "epoch": 0.128,
65
- "eval_loss": 1.5503346920013428,
66
- "eval_runtime": 1506.1101,
67
- "eval_samples_per_second": 3.32,
68
- "eval_steps_per_second": 0.415,
69
- "step": 80
70
- },
71
- {
72
- "epoch": 0.16,
73
- "grad_norm": 0.026087351143360138,
74
- "learning_rate": 0.0002632978723404255,
75
- "loss": 1.539,
76
- "step": 100
77
- },
78
- {
79
- "epoch": 0.16,
80
- "eval_loss": 1.5191439390182495,
81
- "eval_runtime": 1499.8012,
82
- "eval_samples_per_second": 3.334,
83
- "eval_steps_per_second": 0.417,
84
- "step": 100
85
- },
86
- {
87
- "epoch": 0.192,
88
- "grad_norm": 0.03835824504494667,
89
- "learning_rate": 0.00031648936170212765,
90
- "loss": 1.5149,
91
- "step": 120
92
- },
93
- {
94
- "epoch": 0.192,
95
- "eval_loss": 1.5071601867675781,
96
- "eval_runtime": 1498.469,
97
- "eval_samples_per_second": 3.337,
98
- "eval_steps_per_second": 0.417,
99
- "step": 120
100
- },
101
- {
102
- "epoch": 0.224,
103
- "grad_norm": 0.03541890159249306,
104
- "learning_rate": 0.0003696808510638298,
105
- "loss": 1.5267,
106
- "step": 140
107
- },
108
- {
109
- "epoch": 0.224,
110
- "eval_loss": 1.499829649925232,
111
- "eval_runtime": 1486.0248,
112
- "eval_samples_per_second": 3.365,
113
- "eval_steps_per_second": 0.421,
114
- "step": 140
115
- },
116
- {
117
- "epoch": 0.256,
118
- "grad_norm": 0.045721374452114105,
119
- "learning_rate": 0.0004228723404255319,
120
- "loss": 1.4927,
121
- "step": 160
122
- },
123
- {
124
- "epoch": 0.256,
125
- "eval_loss": 1.4962984323501587,
126
- "eval_runtime": 1491.4174,
127
- "eval_samples_per_second": 3.353,
128
- "eval_steps_per_second": 0.419,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.288,
133
- "grad_norm": 0.04470464587211609,
134
- "learning_rate": 0.0004760638297872341,
135
- "loss": 1.4861,
136
- "step": 180
137
- },
138
- {
139
- "epoch": 0.288,
140
- "eval_loss": 1.4931029081344604,
141
- "eval_runtime": 1501.5451,
142
- "eval_samples_per_second": 3.33,
143
- "eval_steps_per_second": 0.416,
144
- "step": 180
145
- },
146
- {
147
- "epoch": 0.32,
148
- "grad_norm": 0.04011245444417,
149
- "learning_rate": 0.0004967397747480735,
150
- "loss": 1.4815,
151
- "step": 200
152
- },
153
- {
154
- "epoch": 0.32,
155
- "eval_loss": 1.491032600402832,
156
- "eval_runtime": 1501.9275,
157
- "eval_samples_per_second": 3.329,
158
- "eval_steps_per_second": 0.416,
159
- "step": 200
160
- },
161
- {
162
- "epoch": 0.352,
163
- "grad_norm": 0.03507564216852188,
164
- "learning_rate": 0.0004908120924718435,
165
- "loss": 1.5256,
166
- "step": 220
167
- },
168
- {
169
- "epoch": 0.352,
170
- "eval_loss": 1.488992691040039,
171
- "eval_runtime": 1498.0439,
172
- "eval_samples_per_second": 3.338,
173
- "eval_steps_per_second": 0.417,
174
- "step": 220
175
- },
176
- {
177
- "epoch": 0.384,
178
- "grad_norm": 0.03816624730825424,
179
- "learning_rate": 0.0004848844101956135,
180
- "loss": 1.4932,
181
- "step": 240
182
- },
183
- {
184
- "epoch": 0.384,
185
- "eval_loss": 1.4867918491363525,
186
- "eval_runtime": 1460.1808,
187
- "eval_samples_per_second": 3.424,
188
- "eval_steps_per_second": 0.428,
189
- "step": 240
190
- },
191
- {
192
- "epoch": 0.416,
193
- "grad_norm": 0.03537523001432419,
194
- "learning_rate": 0.00047895672791938357,
195
- "loss": 1.4747,
196
- "step": 260
197
- },
198
- {
199
- "epoch": 0.416,
200
- "eval_loss": 1.4859720468521118,
201
- "eval_runtime": 1460.3853,
202
- "eval_samples_per_second": 3.424,
203
- "eval_steps_per_second": 0.428,
204
- "step": 260
205
- },
206
- {
207
- "epoch": 0.448,
208
- "grad_norm": 0.037848278880119324,
209
- "learning_rate": 0.0004730290456431535,
210
- "loss": 1.4635,
211
- "step": 280
212
- },
213
- {
214
- "epoch": 0.448,
215
- "eval_loss": 1.4846266508102417,
216
- "eval_runtime": 1460.0793,
217
- "eval_samples_per_second": 3.424,
218
- "eval_steps_per_second": 0.428,
219
- "step": 280
220
- },
221
- {
222
- "epoch": 0.48,
223
- "grad_norm": 0.038794029504060745,
224
- "learning_rate": 0.00046710136336692356,
225
- "loss": 1.5055,
226
- "step": 300
227
- },
228
- {
229
- "epoch": 0.48,
230
- "eval_loss": 1.4849002361297607,
231
- "eval_runtime": 1472.1795,
232
- "eval_samples_per_second": 3.396,
233
- "eval_steps_per_second": 0.425,
234
- "step": 300
235
- },
236
- {
237
- "epoch": 0.512,
238
- "grad_norm": 0.03938526660203934,
239
- "learning_rate": 0.00046117368109069355,
240
- "loss": 1.4996,
241
- "step": 320
242
- },
243
- {
244
- "epoch": 0.512,
245
- "eval_loss": 1.4825308322906494,
246
- "eval_runtime": 1500.8065,
247
- "eval_samples_per_second": 3.332,
248
- "eval_steps_per_second": 0.416,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 0.544,
253
- "grad_norm": 0.041331056505441666,
254
- "learning_rate": 0.00045524599881446355,
255
- "loss": 1.4742,
256
- "step": 340
257
- },
258
- {
259
- "epoch": 0.544,
260
- "eval_loss": 1.4830516576766968,
261
- "eval_runtime": 1501.5622,
262
- "eval_samples_per_second": 3.33,
263
- "eval_steps_per_second": 0.416,
264
- "step": 340
265
- },
266
- {
267
- "epoch": 0.576,
268
- "grad_norm": 0.03991026058793068,
269
- "learning_rate": 0.0004493183165382336,
270
- "loss": 1.4809,
271
- "step": 360
272
- },
273
- {
274
- "epoch": 0.576,
275
- "eval_loss": 1.4818130731582642,
276
- "eval_runtime": 1501.7616,
277
- "eval_samples_per_second": 3.329,
278
- "eval_steps_per_second": 0.416,
279
- "step": 360
280
  }
281
  ],
282
- "logging_steps": 20,
283
  "max_steps": 1875,
284
  "num_input_tokens_seen": 0,
285
  "num_train_epochs": 3,
286
- "save_steps": 10,
287
  "stateful_callbacks": {
288
  "TrainerControl": {
289
  "args": {
@@ -296,7 +41,7 @@
296
  "attributes": {}
297
  }
298
  },
299
- "total_flos": 2.892365895794688e+16,
300
  "train_batch_size": 8,
301
  "trial_name": null,
302
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.0816,
5
+ "eval_steps": 50,
6
+ "global_step": 51,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.08,
13
+ "grad_norm": 0.05632242560386658,
14
+ "learning_rate": 0.00013297872340425532,
15
+ "loss": 1.8852,
16
+ "step": 50
17
  },
18
  {
19
+ "epoch": 0.08,
20
+ "eval_loss": 1.7156720161437988,
21
+ "eval_runtime": 149.1859,
22
+ "eval_samples_per_second": 3.352,
23
+ "eval_steps_per_second": 0.422,
24
+ "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
  ],
27
+ "logging_steps": 50,
28
  "max_steps": 1875,
29
  "num_input_tokens_seen": 0,
30
  "num_train_epochs": 3,
31
+ "save_steps": 50,
32
  "stateful_callbacks": {
33
  "TrainerControl": {
34
  "args": {
 
41
  "attributes": {}
42
  }
43
  },
44
+ "total_flos": 4002786222735360.0,
45
  "train_batch_size": 8,
46
  "trial_name": null,
47
  "trial_params": null
tokenizer_config.json CHANGED
@@ -121,11 +121,15 @@
121
  "clean_up_tokenization_spaces": false,
122
  "eos_token": "<|endoftext|>",
123
  "legacy": false,
 
124
  "model_max_length": 4096,
125
  "pad_token": "<|endoftext|>",
126
  "padding_side": "right",
127
  "sp_model_kwargs": {},
 
128
  "tokenizer_class": "LlamaTokenizer",
 
 
129
  "unk_token": "<unk>",
130
  "use_default_system_prompt": false
131
  }
 
121
  "clean_up_tokenization_spaces": false,
122
  "eos_token": "<|endoftext|>",
123
  "legacy": false,
124
+ "max_length": 512,
125
  "model_max_length": 4096,
126
  "pad_token": "<|endoftext|>",
127
  "padding_side": "right",
128
  "sp_model_kwargs": {},
129
+ "stride": 0,
130
  "tokenizer_class": "LlamaTokenizer",
131
+ "truncation_side": "right",
132
+ "truncation_strategy": "longest_first",
133
  "unk_token": "<unk>",
134
  "use_default_system_prompt": false
135
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4898537838146fbf12c74d87a475327b6a5606d9348748214e9ab9ceaa968a02
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6aadb86e7c11eaf53076fa580bc16d46538da6411436f54844013f1933386b8
3
  size 5432