terry69 commited on
Commit
6028a84
1 Parent(s): 6530003

Model save

Browse files
README.md CHANGED
@@ -2,15 +2,11 @@
2
  license: apache-2.0
3
  base_model: mistralai/Mistral-7B-Instruct-v0.2
4
  tags:
5
- - alignment-handbook
6
- - trl
7
- - sft
8
- - generated_from_trainer
9
  - trl
10
  - sft
11
  - generated_from_trainer
12
  datasets:
13
- - preference-data
14
  model-index:
15
  - name: preference_0.1p_seed42_level3_rare
16
  results: []
@@ -21,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
21
 
22
  # preference_0.1p_seed42_level3_rare
23
 
24
- This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on the preference-data dataset.
25
  It achieves the following results on the evaluation set:
26
- - Loss: 0.7365
27
 
28
  ## Model description
29
 
@@ -60,7 +56,7 @@ The following hyperparameters were used during training:
60
 
61
  | Training Loss | Epoch | Step | Validation Loss |
62
  |:-------------:|:------:|:----:|:---------------:|
63
- | 0.5575 | 0.9967 | 229 | 0.7365 |
64
 
65
 
66
  ### Framework versions
 
2
  license: apache-2.0
3
  base_model: mistralai/Mistral-7B-Instruct-v0.2
4
  tags:
 
 
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
  datasets:
9
+ - generator
10
  model-index:
11
  - name: preference_0.1p_seed42_level3_rare
12
  results: []
 
17
 
18
  # preference_0.1p_seed42_level3_rare
19
 
20
+ This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on the generator dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.7390
23
 
24
  ## Model description
25
 
 
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:------:|:----:|:---------------:|
59
+ | 0.6207 | 0.9979 | 234 | 0.7390 |
60
 
61
 
62
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 0.9967355821545157,
3
- "eval_loss": 0.7364564538002014,
4
- "eval_runtime": 106.4286,
5
- "eval_samples": 999,
6
- "eval_samples_per_second": 3.664,
7
- "eval_steps_per_second": 0.921,
8
- "total_flos": 47895596236800.0,
9
- "train_loss": 0.7106172507506792,
10
- "train_runtime": 7254.3633,
11
  "train_samples": 18788,
12
- "train_samples_per_second": 1.013,
13
  "train_steps_per_second": 0.032
14
  }
 
1
  {
2
+ "epoch": 0.997867803837953,
3
+ "total_flos": 48942494515200.0,
4
+ "train_loss": 0.7695284368645432,
5
+ "train_runtime": 7306.9109,
 
 
 
 
 
6
  "train_samples": 18788,
7
+ "train_samples_per_second": 1.026,
8
  "train_steps_per_second": 0.032
9
  }
config.json CHANGED
@@ -22,6 +22,6 @@
22
  "tie_word_embeddings": false,
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.43.4",
25
- "use_cache": true,
26
  "vocab_size": 32000
27
  }
 
22
  "tie_word_embeddings": false,
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.43.4",
25
+ "use_cache": false,
26
  "vocab_size": 32000
27
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d48bf1eb9b91e3e33ef4b68df0b104dc5ed8a1c29489fdf24d1112e53c1873dc
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86f4d7f7f71205d776a15d9bb271ef8e4c5b85a4464a7d9cc8fabf5abbe6742a
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e966f0e6454b578cc2ce0bfc4e1d4d089ffa63860cbff54dd59cc1a4bea67a6
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d3da52201a469e680648b0686698f7ed8cc2c1f0cf1fb25f271d2e05cdf9b69
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f001e5166a1403d88df7fb44a8b577563bfa911db02dc5cb3a05a62420984129
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e053a2e5d6ca1464762eb049d3dfabc9f0e6457f3bc4384424e2868e4071f4ba
3
  size 4540516344
runs/Aug13_13-34-23_COE-CS-sv004/events.out.tfevents.1723556233.COE-CS-sv004.2672049.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af08c6c58d2a3c6ef9cf9bc2649171c3215d1f8b4368f5fd3012ccbca822bf94
3
+ size 15669
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9967355821545157,
3
- "total_flos": 47895596236800.0,
4
- "train_loss": 0.7106172507506792,
5
- "train_runtime": 7254.3633,
6
  "train_samples": 18788,
7
- "train_samples_per_second": 1.013,
8
  "train_steps_per_second": 0.032
9
  }
 
1
  {
2
+ "epoch": 0.997867803837953,
3
+ "total_flos": 48942494515200.0,
4
+ "train_loss": 0.7695284368645432,
5
+ "train_runtime": 7306.9109,
6
  "train_samples": 18788,
7
+ "train_samples_per_second": 1.026,
8
  "train_steps_per_second": 0.032
9
  }
trainer_state.json CHANGED
@@ -1,355 +1,362 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9967355821545157,
5
  "eval_steps": 500,
6
- "global_step": 229,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.004352557127312296,
13
- "grad_norm": 23.969777401039607,
14
- "learning_rate": 4.347826086956522e-07,
15
- "loss": 1.4137,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.02176278563656148,
20
- "grad_norm": 8.67616469459428,
21
- "learning_rate": 2.173913043478261e-06,
22
- "loss": 1.3829,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.04352557127312296,
27
- "grad_norm": 3.5647414259572265,
28
- "learning_rate": 4.347826086956522e-06,
29
- "loss": 1.1134,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.06528835690968444,
34
- "grad_norm": 3.2984928713958666,
35
- "learning_rate": 6.521739130434783e-06,
36
- "loss": 0.9599,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.08705114254624592,
41
- "grad_norm": 2.725944554945545,
42
- "learning_rate": 8.695652173913044e-06,
43
- "loss": 0.924,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.1088139281828074,
48
- "grad_norm": 2.690177981501858,
49
- "learning_rate": 9.997674418116759e-06,
50
- "loss": 0.8916,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.1305767138193689,
55
- "grad_norm": 2.575585422886906,
56
- "learning_rate": 9.971536460096021e-06,
57
- "loss": 0.8802,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.15233949945593037,
62
- "grad_norm": 2.4168957628954746,
63
- "learning_rate": 9.916505976821262e-06,
64
- "loss": 0.8682,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.17410228509249184,
69
- "grad_norm": 2.597564344293517,
70
- "learning_rate": 9.832902782828801e-06,
71
- "loss": 0.8501,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.19586507072905332,
76
- "grad_norm": 2.8200473310088534,
77
- "learning_rate": 9.721212745498493e-06,
78
- "loss": 0.8429,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.2176278563656148,
83
- "grad_norm": 2.528361018534502,
84
- "learning_rate": 9.582084961392358e-06,
85
- "loss": 0.8167,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.23939064200217627,
90
- "grad_norm": 2.3876192230621047,
91
- "learning_rate": 9.416327983972304e-06,
92
- "loss": 0.8087,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.2611534276387378,
97
- "grad_norm": 2.4893211482371767,
98
- "learning_rate": 9.22490512461995e-06,
99
- "loss": 0.8027,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.28291621327529926,
104
- "grad_norm": 2.958049999378203,
105
- "learning_rate": 9.008928854267054e-06,
106
- "loss": 0.7661,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.30467899891186073,
111
- "grad_norm": 3.0632090846251523,
112
- "learning_rate": 8.769654338171986e-06,
113
- "loss": 0.7773,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.3264417845484222,
118
- "grad_norm": 2.6123024349921926,
119
- "learning_rate": 8.508472141415468e-06,
120
- "loss": 0.7753,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.3482045701849837,
125
- "grad_norm": 2.695553953076791,
126
- "learning_rate": 8.226900147508205e-06,
127
- "loss": 0.7504,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.36996735582154516,
132
- "grad_norm": 3.1757782840030604,
133
- "learning_rate": 7.92657473707621e-06,
134
- "loss": 0.7358,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.39173014145810664,
139
- "grad_norm": 2.4347947902045326,
140
- "learning_rate": 7.609241277889583e-06,
141
- "loss": 0.738,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.4134929270946681,
146
- "grad_norm": 2.4759060035072924,
147
- "learning_rate": 7.276743981502856e-06,
148
- "loss": 0.7013,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.4352557127312296,
153
- "grad_norm": 2.4837499004026506,
154
- "learning_rate": 6.931015185455915e-06,
155
- "loss": 0.7042,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.45701849836779107,
160
- "grad_norm": 2.589964756825322,
161
- "learning_rate": 6.574064123322925e-06,
162
- "loss": 0.6864,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.47878128400435255,
167
- "grad_norm": 2.4839924992380187,
168
- "learning_rate": 6.207965247873151e-06,
169
- "loss": 0.6839,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.500544069640914,
174
- "grad_norm": 2.470265064900677,
175
- "learning_rate": 5.834846175204612e-06,
176
- "loss": 0.6657,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.5223068552774756,
181
- "grad_norm": 2.374809615056107,
182
- "learning_rate": 5.456875319914355e-06,
183
- "loss": 0.6616,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.544069640914037,
188
- "grad_norm": 2.546663639426302,
189
- "learning_rate": 5.07624929316463e-06,
190
- "loss": 0.641,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.5658324265505985,
195
- "grad_norm": 2.4876523628247615,
196
- "learning_rate": 4.6951801368822055e-06,
197
- "loss": 0.6407,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.5875952121871599,
202
- "grad_norm": 2.4763298540381666,
203
- "learning_rate": 4.31588246828045e-06,
204
- "loss": 0.6324,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 0.6093579978237215,
209
- "grad_norm": 2.776813210102937,
210
- "learning_rate": 3.940560609414894e-06,
211
- "loss": 0.6377,
212
  "step": 140
213
  },
214
  {
215
- "epoch": 0.6311207834602829,
216
- "grad_norm": 2.35611378965622,
217
- "learning_rate": 3.5713957765700224e-06,
218
- "loss": 0.6239,
219
  "step": 145
220
  },
221
  {
222
- "epoch": 0.6528835690968444,
223
- "grad_norm": 2.2743328506833427,
224
- "learning_rate": 3.2105334039272924e-06,
225
- "loss": 0.5998,
226
  "step": 150
227
  },
228
  {
229
- "epoch": 0.6746463547334058,
230
- "grad_norm": 2.9956954866670285,
231
- "learning_rate": 2.860070675184036e-06,
232
- "loss": 0.5962,
233
  "step": 155
234
  },
235
  {
236
- "epoch": 0.6964091403699674,
237
- "grad_norm": 2.595628719529078,
238
- "learning_rate": 2.522044335584322e-06,
239
- "loss": 0.6027,
240
  "step": 160
241
  },
242
  {
243
- "epoch": 0.7181719260065288,
244
- "grad_norm": 2.671379662704793,
245
- "learning_rate": 2.1984188551932513e-06,
246
- "loss": 0.593,
247
  "step": 165
248
  },
249
  {
250
- "epoch": 0.7399347116430903,
251
- "grad_norm": 2.31173255024942,
252
- "learning_rate": 1.8910750122048638e-06,
253
- "loss": 0.5826,
254
  "step": 170
255
  },
256
  {
257
- "epoch": 0.7616974972796517,
258
- "grad_norm": 2.320313991995028,
259
- "learning_rate": 1.601798962632799e-06,
260
- "loss": 0.564,
261
  "step": 175
262
  },
263
  {
264
- "epoch": 0.7834602829162133,
265
- "grad_norm": 2.3880739142797864,
266
- "learning_rate": 1.3322718599061252e-06,
267
- "loss": 0.5675,
268
  "step": 180
269
  },
270
  {
271
- "epoch": 0.8052230685527747,
272
- "grad_norm": 2.350078693554661,
273
- "learning_rate": 1.0840600846970333e-06,
274
- "loss": 0.5643,
275
  "step": 185
276
  },
277
  {
278
- "epoch": 0.8269858541893362,
279
- "grad_norm": 2.2922329677792286,
280
- "learning_rate": 8.586061417605668e-07,
281
- "loss": 0.5686,
282
  "step": 190
283
  },
284
  {
285
- "epoch": 0.8487486398258978,
286
- "grad_norm": 2.2238743664815086,
287
- "learning_rate": 6.572202766902569e-07,
288
- "loss": 0.5542,
289
  "step": 195
290
  },
291
  {
292
- "epoch": 0.8705114254624592,
293
- "grad_norm": 2.453849740180281,
294
- "learning_rate": 4.81072861309591e-07,
295
- "loss": 0.5717,
296
  "step": 200
297
  },
298
  {
299
- "epoch": 0.8922742110990207,
300
- "grad_norm": 2.3379496520696197,
301
- "learning_rate": 3.3118759195232273e-07,
302
- "loss": 0.5505,
303
  "step": 205
304
  },
305
  {
306
- "epoch": 0.9140369967355821,
307
- "grad_norm": 2.3452550178883507,
308
- "learning_rate": 2.0843554016039326e-07,
309
- "loss": 0.5496,
310
  "step": 210
311
  },
312
  {
313
- "epoch": 0.9357997823721437,
314
- "grad_norm": 2.279878770276356,
315
- "learning_rate": 1.1353009037437523e-07,
316
- "loss": 0.5506,
317
  "step": 215
318
  },
319
  {
320
- "epoch": 0.9575625680087051,
321
- "grad_norm": 2.2351766119106924,
322
- "learning_rate": 4.702279403650534e-08,
323
- "loss": 0.5547,
324
  "step": 220
325
  },
326
  {
327
- "epoch": 0.9793253536452666,
328
- "grad_norm": 2.2925674952536825,
329
- "learning_rate": 9.300164200530815e-09,
330
- "loss": 0.5575,
331
  "step": 225
332
  },
333
  {
334
- "epoch": 0.9967355821545157,
335
- "eval_loss": 0.7364564538002014,
336
- "eval_runtime": 106.3104,
337
- "eval_samples_per_second": 3.669,
 
 
 
 
 
 
 
338
  "eval_steps_per_second": 0.922,
339
- "step": 229
340
  },
341
  {
342
- "epoch": 0.9967355821545157,
343
- "step": 229,
344
- "total_flos": 47895596236800.0,
345
- "train_loss": 0.7106172507506792,
346
- "train_runtime": 7254.3633,
347
- "train_samples_per_second": 1.013,
348
  "train_steps_per_second": 0.032
349
  }
350
  ],
351
  "logging_steps": 5,
352
- "max_steps": 229,
353
  "num_input_tokens_seen": 0,
354
  "num_train_epochs": 1,
355
  "save_steps": 100,
@@ -365,7 +372,7 @@
365
  "attributes": {}
366
  }
367
  },
368
- "total_flos": 47895596236800.0,
369
  "train_batch_size": 2,
370
  "trial_name": null,
371
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.997867803837953,
5
  "eval_steps": 500,
6
+ "global_step": 234,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0042643923240938165,
13
+ "grad_norm": 25.654364462455888,
14
+ "learning_rate": 4.1666666666666667e-07,
15
+ "loss": 1.5118,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.021321961620469083,
20
+ "grad_norm": 9.217679826848354,
21
+ "learning_rate": 2.0833333333333334e-06,
22
+ "loss": 1.4588,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.042643923240938165,
27
+ "grad_norm": 3.8364568135386343,
28
+ "learning_rate": 4.166666666666667e-06,
29
+ "loss": 1.1844,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.06396588486140725,
34
+ "grad_norm": 2.7685426084723606,
35
+ "learning_rate": 6.25e-06,
36
+ "loss": 1.0266,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.08528784648187633,
41
+ "grad_norm": 2.714167254334698,
42
+ "learning_rate": 8.333333333333334e-06,
43
+ "loss": 0.9764,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.10660980810234541,
48
+ "grad_norm": 2.575043424767955,
49
+ "learning_rate": 9.999440509051367e-06,
50
+ "loss": 0.9474,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.1279317697228145,
55
+ "grad_norm": 2.5473126644658635,
56
+ "learning_rate": 9.979871469976197e-06,
57
+ "loss": 0.9265,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.14925373134328357,
62
+ "grad_norm": 2.7728522540985927,
63
+ "learning_rate": 9.932452969617607e-06,
64
+ "loss": 0.9103,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.17057569296375266,
69
+ "grad_norm": 2.262656802643975,
70
+ "learning_rate": 9.857450191464337e-06,
71
+ "loss": 0.9089,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.19189765458422176,
76
+ "grad_norm": 2.2929860595064353,
77
+ "learning_rate": 9.755282581475769e-06,
78
+ "loss": 0.8839,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.21321961620469082,
83
+ "grad_norm": 2.9962187125117556,
84
+ "learning_rate": 9.626521502369984e-06,
85
+ "loss": 0.8779,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.2345415778251599,
90
+ "grad_norm": 2.4461853196937744,
91
+ "learning_rate": 9.471887038331686e-06,
92
+ "loss": 0.8655,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.255863539445629,
97
+ "grad_norm": 2.548713200713329,
98
+ "learning_rate": 9.292243968009332e-06,
99
+ "loss": 0.8452,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.2771855010660981,
104
+ "grad_norm": 2.354080355646257,
105
+ "learning_rate": 9.088596928322158e-06,
106
+ "loss": 0.8453,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.29850746268656714,
111
+ "grad_norm": 2.3350186621937494,
112
+ "learning_rate": 8.862084796122998e-06,
113
+ "loss": 0.8213,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.31982942430703626,
118
+ "grad_norm": 2.352888208422696,
119
+ "learning_rate": 8.613974319136959e-06,
120
+ "loss": 0.8087,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.3411513859275053,
125
+ "grad_norm": 2.626490865987853,
126
+ "learning_rate": 8.345653031794292e-06,
127
+ "loss": 0.8,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.3624733475479744,
132
+ "grad_norm": 2.2564126156464934,
133
+ "learning_rate": 8.058621495575032e-06,
134
+ "loss": 0.7883,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.3837953091684435,
139
+ "grad_norm": 2.536678489630529,
140
+ "learning_rate": 7.754484907260513e-06,
141
+ "loss": 0.7797,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.4051172707889126,
146
+ "grad_norm": 2.330261835490306,
147
+ "learning_rate": 7.434944122021837e-06,
148
+ "loss": 0.7704,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.42643923240938164,
153
+ "grad_norm": 2.375473887900136,
154
+ "learning_rate": 7.101786141547829e-06,
155
+ "loss": 0.7491,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.44776119402985076,
160
+ "grad_norm": 2.31845485895562,
161
+ "learning_rate": 6.7568741204067145e-06,
162
+ "loss": 0.7426,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.4690831556503198,
167
+ "grad_norm": 2.2326175780721513,
168
+ "learning_rate": 6.402136946530014e-06,
169
+ "loss": 0.7366,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 0.4904051172707889,
174
+ "grad_norm": 2.444799836226394,
175
+ "learning_rate": 6.039558454088796e-06,
176
+ "loss": 0.7294,
177
  "step": 115
178
  },
179
  {
180
+ "epoch": 0.511727078891258,
181
+ "grad_norm": 2.42023799653421,
182
+ "learning_rate": 5.671166329088278e-06,
183
+ "loss": 0.7346,
184
  "step": 120
185
  },
186
  {
187
+ "epoch": 0.5330490405117271,
188
+ "grad_norm": 2.525769921790198,
189
+ "learning_rate": 5.299020769725172e-06,
190
+ "loss": 0.716,
191
  "step": 125
192
  },
193
  {
194
+ "epoch": 0.5543710021321961,
195
+ "grad_norm": 2.210624855154462,
196
+ "learning_rate": 4.9252029649236835e-06,
197
+ "loss": 0.7087,
198
  "step": 130
199
  },
200
  {
201
+ "epoch": 0.5756929637526652,
202
+ "grad_norm": 2.260417777455262,
203
+ "learning_rate": 4.551803455482833e-06,
204
+ "loss": 0.6979,
205
  "step": 135
206
  },
207
  {
208
+ "epoch": 0.5970149253731343,
209
+ "grad_norm": 2.5410734519213847,
210
+ "learning_rate": 4.180910442924312e-06,
211
+ "loss": 0.6758,
212
  "step": 140
213
  },
214
  {
215
+ "epoch": 0.6183368869936035,
216
+ "grad_norm": 2.2197214614990983,
217
+ "learning_rate": 3.8145981114225135e-06,
218
+ "loss": 0.6832,
219
  "step": 145
220
  },
221
  {
222
+ "epoch": 0.6396588486140725,
223
+ "grad_norm": 2.417478197312417,
224
+ "learning_rate": 3.4549150281252635e-06,
225
+ "loss": 0.6705,
226
  "step": 150
227
  },
228
  {
229
+ "epoch": 0.6609808102345416,
230
+ "grad_norm": 2.193206874567919,
231
+ "learning_rate": 3.1038726867353587e-06,
232
+ "loss": 0.6909,
233
  "step": 155
234
  },
235
  {
236
+ "epoch": 0.6823027718550106,
237
+ "grad_norm": 2.3141978562259133,
238
+ "learning_rate": 2.7634342584218364e-06,
239
+ "loss": 0.678,
240
  "step": 160
241
  },
242
  {
243
+ "epoch": 0.7036247334754797,
244
+ "grad_norm": 2.20282691421215,
245
+ "learning_rate": 2.43550361297047e-06,
246
+ "loss": 0.6646,
247
  "step": 165
248
  },
249
  {
250
+ "epoch": 0.7249466950959488,
251
+ "grad_norm": 2.3241432733966962,
252
+ "learning_rate": 2.1219146715716332e-06,
253
+ "loss": 0.6633,
254
  "step": 170
255
  },
256
  {
257
+ "epoch": 0.746268656716418,
258
+ "grad_norm": 2.3658483418520464,
259
+ "learning_rate": 1.8244211507891064e-06,
260
+ "loss": 0.6516,
261
  "step": 175
262
  },
263
  {
264
+ "epoch": 0.767590618336887,
265
+ "grad_norm": 2.259696417637488,
266
+ "learning_rate": 1.544686755065677e-06,
267
+ "loss": 0.6418,
268
  "step": 180
269
  },
270
  {
271
+ "epoch": 0.7889125799573561,
272
+ "grad_norm": 2.284368925546414,
273
+ "learning_rate": 1.2842758726130283e-06,
274
+ "loss": 0.6405,
275
  "step": 185
276
  },
277
  {
278
+ "epoch": 0.8102345415778252,
279
+ "grad_norm": 2.2174015564488223,
280
+ "learning_rate": 1.044644826718295e-06,
281
+ "loss": 0.6359,
282
  "step": 190
283
  },
284
  {
285
+ "epoch": 0.8315565031982942,
286
+ "grad_norm": 2.3098966859462076,
287
+ "learning_rate": 8.271337313934869e-07,
288
+ "loss": 0.6232,
289
  "step": 195
290
  },
291
  {
292
+ "epoch": 0.8528784648187633,
293
+ "grad_norm": 2.240425165408693,
294
+ "learning_rate": 6.329589969143518e-07,
295
+ "loss": 0.6263,
296
  "step": 200
297
  },
298
  {
299
+ "epoch": 0.8742004264392325,
300
+ "grad_norm": 2.203409177091297,
301
+ "learning_rate": 4.632065271606756e-07,
302
+ "loss": 0.6299,
303
  "step": 205
304
  },
305
  {
306
+ "epoch": 0.8955223880597015,
307
+ "grad_norm": 2.1702011902470724,
308
+ "learning_rate": 3.18825646801314e-07,
309
+ "loss": 0.636,
310
  "step": 210
311
  },
312
  {
313
+ "epoch": 0.9168443496801706,
314
+ "grad_norm": 2.202446820245564,
315
+ "learning_rate": 2.006237922855553e-07,
316
+ "loss": 0.6182,
317
  "step": 215
318
  },
319
  {
320
+ "epoch": 0.9381663113006397,
321
+ "grad_norm": 2.118840248626809,
322
+ "learning_rate": 1.0926199633097156e-07,
323
+ "loss": 0.609,
324
  "step": 220
325
  },
326
  {
327
+ "epoch": 0.9594882729211087,
328
+ "grad_norm": 2.168175873632397,
329
+ "learning_rate": 4.52511911603265e-08,
330
+ "loss": 0.6173,
331
  "step": 225
332
  },
333
  {
334
+ "epoch": 0.9808102345415778,
335
+ "grad_norm": 2.2624619803066617,
336
+ "learning_rate": 8.949351161324227e-09,
337
+ "loss": 0.6207,
338
+ "step": 230
339
+ },
340
+ {
341
+ "epoch": 0.997867803837953,
342
+ "eval_loss": 0.7390011548995972,
343
+ "eval_runtime": 106.273,
344
+ "eval_samples_per_second": 3.67,
345
  "eval_steps_per_second": 0.922,
346
+ "step": 234
347
  },
348
  {
349
+ "epoch": 0.997867803837953,
350
+ "step": 234,
351
+ "total_flos": 48942494515200.0,
352
+ "train_loss": 0.7695284368645432,
353
+ "train_runtime": 7306.9109,
354
+ "train_samples_per_second": 1.026,
355
  "train_steps_per_second": 0.032
356
  }
357
  ],
358
  "logging_steps": 5,
359
+ "max_steps": 234,
360
  "num_input_tokens_seen": 0,
361
  "num_train_epochs": 1,
362
  "save_steps": 100,
 
372
  "attributes": {}
373
  }
374
  },
375
+ "total_flos": 48942494515200.0,
376
  "train_batch_size": 2,
377
  "trial_name": null,
378
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e3bd716872e859fc26cb2d83509c29b1c0e5972c73d760b0ac429e559979835
3
  size 6520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53ce655e7ab1129bcd883d7974bca1f5c5778062e5c8a4eb41ef38796a8ee647
3
  size 6520