Hailey Schoelkopf commited on
Commit
cb26d9b
1 Parent(s): b2bafba

Push to HF hub

Browse files
.gitattributes CHANGED
@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
34
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ checkpoint-15000/
2
+ events.out.tfevents.1662816287.vesna.462489.2
3
+ 1662770689.13114/
4
+ checkpoint-5000/
5
+ checkpoint-20000/
6
+ events.out.tfevents.1662770689.vesna.462489.0
7
+ eval_results.json
8
+ word_embeddings_layernorm.pt
9
+ word_embeddings.pt
10
+ train_results.json
11
+ events.out.tfevents.1662862583.vesna.462489.4
12
+ checkpoint-25000/
13
+ all_results.json
14
+ 1662816287.2910337/
15
+ checkpoint-10000/
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bigscience/bloom-560m",
3
+ "adapters": {
4
+ "adapters": {},
5
+ "config_map": {},
6
+ "fusion_config_map": {},
7
+ "fusions": {}
8
+ },
9
+ "apply_residual_connection_post_layernorm": false,
10
+ "architectures": [
11
+ "BloomForCausalLM"
12
+ ],
13
+ "attention_dropout": 0.0,
14
+ "attention_softmax_in_fp32": true,
15
+ "bias_dropout_fusion": true,
16
+ "bos_token_id": 1,
17
+ "eos_token_id": 2,
18
+ "hidden_dropout": 0.0,
19
+ "hidden_size": 1024,
20
+ "initializer_range": 0.02,
21
+ "layer_norm_epsilon": 1e-05,
22
+ "masked_softmax_fusion": true,
23
+ "model_type": "bloom",
24
+ "n_head": 16,
25
+ "n_inner": null,
26
+ "n_layer": 24,
27
+ "offset_alibi": 100,
28
+ "pad_token_id": 3,
29
+ "pretraining_tp": 1,
30
+ "seq_length": 2048,
31
+ "skip_bias_add": true,
32
+ "skip_bias_add_qkv": false,
33
+ "slow_but_exact": false,
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.20.0.dev0",
36
+ "unk_token_id": 0,
37
+ "use_cache": true,
38
+ "vocab_size": 250880
39
+ }
pytorch_diff.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c7d018131b8deb6f7bf301f64356da3670bde68a182c0f49f01eff1607ef448
3
+ size 22518333
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c4aadf9c13098a3b1e56255fd62598ff3953b3383bcde7bf9f691563da79d0f
3
+ size 2236955191
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f6efc66e73f1fd69da4f436e48befb519fdff3fe18910850c1d41bd862293a5
3
+ size 14500443
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "add_prefix_space": false, "name_or_path": "bigscience/bloom-560m", "special_tokens_map_file": null, "padding_side": "left", "tokenizer_class": "BloomTokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 2.210977077484131,
3
+ "best_model_checkpoint": "/home/lily/nos6/bloom-adapt/exps_ru/bigscience/bloom-560m_ru_adpt_sft_original-frozen_100_000samples/checkpoint-25000",
4
+ "epoch": 1.462073805485701,
5
+ "global_step": 25000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.03,
12
+ "l1_reg_loss": 0.0,
13
+ "learning_rate": 9.8e-05,
14
+ "loss": 2.8794,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.06,
19
+ "l1_reg_loss": 0.0,
20
+ "learning_rate": 9.6e-05,
21
+ "loss": 2.746,
22
+ "step": 1000
23
+ },
24
+ {
25
+ "epoch": 0.09,
26
+ "l1_reg_loss": 0.0,
27
+ "learning_rate": 9.4e-05,
28
+ "loss": 2.6667,
29
+ "step": 1500
30
+ },
31
+ {
32
+ "epoch": 0.12,
33
+ "l1_reg_loss": 0.0,
34
+ "learning_rate": 9.200000000000001e-05,
35
+ "loss": 2.6093,
36
+ "step": 2000
37
+ },
38
+ {
39
+ "epoch": 0.15,
40
+ "l1_reg_loss": 0.0,
41
+ "learning_rate": 9e-05,
42
+ "loss": 2.5672,
43
+ "step": 2500
44
+ },
45
+ {
46
+ "epoch": 0.18,
47
+ "l1_reg_loss": 0.0,
48
+ "learning_rate": 8.800000000000001e-05,
49
+ "loss": 2.5343,
50
+ "step": 3000
51
+ },
52
+ {
53
+ "epoch": 0.2,
54
+ "l1_reg_loss": 0.0,
55
+ "learning_rate": 8.6e-05,
56
+ "loss": 2.4979,
57
+ "step": 3500
58
+ },
59
+ {
60
+ "epoch": 0.23,
61
+ "l1_reg_loss": 0.0,
62
+ "learning_rate": 8.4e-05,
63
+ "loss": 2.467,
64
+ "step": 4000
65
+ },
66
+ {
67
+ "epoch": 0.26,
68
+ "l1_reg_loss": 0.0,
69
+ "learning_rate": 8.2e-05,
70
+ "loss": 2.4436,
71
+ "step": 4500
72
+ },
73
+ {
74
+ "epoch": 0.29,
75
+ "l1_reg_loss": 0.0,
76
+ "learning_rate": 8e-05,
77
+ "loss": 2.4269,
78
+ "step": 5000
79
+ },
80
+ {
81
+ "epoch": 0.29,
82
+ "eval_loss": 2.4169912338256836,
83
+ "eval_runtime": 487.268,
84
+ "eval_samples_per_second": 13.656,
85
+ "eval_steps_per_second": 3.415,
86
+ "step": 5000
87
+ },
88
+ {
89
+ "epoch": 0.32,
90
+ "l1_reg_loss": 0.0,
91
+ "learning_rate": 7.800000000000001e-05,
92
+ "loss": 2.4069,
93
+ "step": 5500
94
+ },
95
+ {
96
+ "epoch": 0.35,
97
+ "l1_reg_loss": 0.0,
98
+ "learning_rate": 7.6e-05,
99
+ "loss": 2.3837,
100
+ "step": 6000
101
+ },
102
+ {
103
+ "epoch": 0.38,
104
+ "l1_reg_loss": 0.0,
105
+ "learning_rate": 7.4e-05,
106
+ "loss": 2.3675,
107
+ "step": 6500
108
+ },
109
+ {
110
+ "epoch": 0.41,
111
+ "l1_reg_loss": 0.0,
112
+ "learning_rate": 7.2e-05,
113
+ "loss": 2.3656,
114
+ "step": 7000
115
+ },
116
+ {
117
+ "epoch": 0.44,
118
+ "l1_reg_loss": 0.0,
119
+ "learning_rate": 7e-05,
120
+ "loss": 2.3445,
121
+ "step": 7500
122
+ },
123
+ {
124
+ "epoch": 0.47,
125
+ "l1_reg_loss": 0.0,
126
+ "learning_rate": 6.800000000000001e-05,
127
+ "loss": 2.3281,
128
+ "step": 8000
129
+ },
130
+ {
131
+ "epoch": 0.5,
132
+ "l1_reg_loss": 0.0,
133
+ "learning_rate": 6.6e-05,
134
+ "loss": 2.3295,
135
+ "step": 8500
136
+ },
137
+ {
138
+ "epoch": 0.53,
139
+ "l1_reg_loss": 0.0,
140
+ "learning_rate": 6.400000000000001e-05,
141
+ "loss": 2.2996,
142
+ "step": 9000
143
+ },
144
+ {
145
+ "epoch": 0.56,
146
+ "l1_reg_loss": 0.0,
147
+ "learning_rate": 6.2e-05,
148
+ "loss": 2.2968,
149
+ "step": 9500
150
+ },
151
+ {
152
+ "epoch": 0.58,
153
+ "l1_reg_loss": 0.0,
154
+ "learning_rate": 6e-05,
155
+ "loss": 2.2965,
156
+ "step": 10000
157
+ },
158
+ {
159
+ "epoch": 0.58,
160
+ "eval_loss": 2.29951810836792,
161
+ "eval_runtime": 487.1711,
162
+ "eval_samples_per_second": 13.658,
163
+ "eval_steps_per_second": 3.416,
164
+ "step": 10000
165
+ },
166
+ {
167
+ "epoch": 0.61,
168
+ "l1_reg_loss": 0.0,
169
+ "learning_rate": 5.8e-05,
170
+ "loss": 2.288,
171
+ "step": 10500
172
+ },
173
+ {
174
+ "epoch": 0.64,
175
+ "l1_reg_loss": 0.0,
176
+ "learning_rate": 5.6000000000000006e-05,
177
+ "loss": 2.2827,
178
+ "step": 11000
179
+ },
180
+ {
181
+ "epoch": 0.67,
182
+ "l1_reg_loss": 0.0,
183
+ "learning_rate": 5.4000000000000005e-05,
184
+ "loss": 2.2761,
185
+ "step": 11500
186
+ },
187
+ {
188
+ "epoch": 0.7,
189
+ "l1_reg_loss": 0.0,
190
+ "learning_rate": 5.2000000000000004e-05,
191
+ "loss": 2.278,
192
+ "step": 12000
193
+ },
194
+ {
195
+ "epoch": 0.73,
196
+ "l1_reg_loss": 0.0,
197
+ "learning_rate": 5e-05,
198
+ "loss": 2.2664,
199
+ "step": 12500
200
+ },
201
+ {
202
+ "epoch": 0.76,
203
+ "l1_reg_loss": 0.0,
204
+ "learning_rate": 4.8e-05,
205
+ "loss": 2.2475,
206
+ "step": 13000
207
+ },
208
+ {
209
+ "epoch": 0.79,
210
+ "l1_reg_loss": 0.0,
211
+ "learning_rate": 4.600000000000001e-05,
212
+ "loss": 2.2463,
213
+ "step": 13500
214
+ },
215
+ {
216
+ "epoch": 0.82,
217
+ "l1_reg_loss": 0.0,
218
+ "learning_rate": 4.4000000000000006e-05,
219
+ "loss": 2.2492,
220
+ "step": 14000
221
+ },
222
+ {
223
+ "epoch": 0.85,
224
+ "l1_reg_loss": 0.0,
225
+ "learning_rate": 4.2e-05,
226
+ "loss": 2.2555,
227
+ "step": 14500
228
+ },
229
+ {
230
+ "epoch": 0.88,
231
+ "l1_reg_loss": 0.0,
232
+ "learning_rate": 4e-05,
233
+ "loss": 2.2433,
234
+ "step": 15000
235
+ },
236
+ {
237
+ "epoch": 0.88,
238
+ "eval_loss": 2.244656801223755,
239
+ "eval_runtime": 487.1914,
240
+ "eval_samples_per_second": 13.658,
241
+ "eval_steps_per_second": 3.415,
242
+ "step": 15000
243
+ },
244
+ {
245
+ "epoch": 0.91,
246
+ "l1_reg_loss": 0.0,
247
+ "learning_rate": 3.8e-05,
248
+ "loss": 2.2466,
249
+ "step": 15500
250
+ },
251
+ {
252
+ "epoch": 0.94,
253
+ "l1_reg_loss": 0.0,
254
+ "learning_rate": 3.6e-05,
255
+ "loss": 2.2253,
256
+ "step": 16000
257
+ },
258
+ {
259
+ "epoch": 0.96,
260
+ "l1_reg_loss": 0.0,
261
+ "learning_rate": 3.4000000000000007e-05,
262
+ "loss": 2.2238,
263
+ "step": 16500
264
+ },
265
+ {
266
+ "epoch": 0.99,
267
+ "l1_reg_loss": 0.0,
268
+ "learning_rate": 3.2000000000000005e-05,
269
+ "loss": 2.2352,
270
+ "step": 17000
271
+ },
272
+ {
273
+ "epoch": 1.02,
274
+ "l1_reg_loss": 0.0,
275
+ "learning_rate": 3e-05,
276
+ "loss": 2.2222,
277
+ "step": 17500
278
+ },
279
+ {
280
+ "epoch": 1.05,
281
+ "l1_reg_loss": 0.0,
282
+ "learning_rate": 2.8000000000000003e-05,
283
+ "loss": 2.2234,
284
+ "step": 18000
285
+ },
286
+ {
287
+ "epoch": 1.08,
288
+ "l1_reg_loss": 0.0,
289
+ "learning_rate": 2.6000000000000002e-05,
290
+ "loss": 2.2176,
291
+ "step": 18500
292
+ },
293
+ {
294
+ "epoch": 1.11,
295
+ "l1_reg_loss": 0.0,
296
+ "learning_rate": 2.4e-05,
297
+ "loss": 2.2084,
298
+ "step": 19000
299
+ },
300
+ {
301
+ "epoch": 1.14,
302
+ "l1_reg_loss": 0.0,
303
+ "learning_rate": 2.2000000000000003e-05,
304
+ "loss": 2.2111,
305
+ "step": 19500
306
+ },
307
+ {
308
+ "epoch": 1.17,
309
+ "l1_reg_loss": 0.0,
310
+ "learning_rate": 2e-05,
311
+ "loss": 2.2014,
312
+ "step": 20000
313
+ },
314
+ {
315
+ "epoch": 1.17,
316
+ "eval_loss": 2.219003677368164,
317
+ "eval_runtime": 487.1412,
318
+ "eval_samples_per_second": 13.659,
319
+ "eval_steps_per_second": 3.416,
320
+ "step": 20000
321
+ },
322
+ {
323
+ "epoch": 1.2,
324
+ "l1_reg_loss": 0.0,
325
+ "learning_rate": 1.8e-05,
326
+ "loss": 2.2114,
327
+ "step": 20500
328
+ },
329
+ {
330
+ "epoch": 1.23,
331
+ "l1_reg_loss": 0.0,
332
+ "learning_rate": 1.6000000000000003e-05,
333
+ "loss": 2.2049,
334
+ "step": 21000
335
+ },
336
+ {
337
+ "epoch": 1.26,
338
+ "l1_reg_loss": 0.0,
339
+ "learning_rate": 1.4000000000000001e-05,
340
+ "loss": 2.2104,
341
+ "step": 21500
342
+ },
343
+ {
344
+ "epoch": 1.29,
345
+ "l1_reg_loss": 0.0,
346
+ "learning_rate": 1.2e-05,
347
+ "loss": 2.2046,
348
+ "step": 22000
349
+ },
350
+ {
351
+ "epoch": 1.32,
352
+ "l1_reg_loss": 0.0,
353
+ "learning_rate": 1e-05,
354
+ "loss": 2.1999,
355
+ "step": 22500
356
+ },
357
+ {
358
+ "epoch": 1.35,
359
+ "l1_reg_loss": 0.0,
360
+ "learning_rate": 8.000000000000001e-06,
361
+ "loss": 2.2012,
362
+ "step": 23000
363
+ },
364
+ {
365
+ "epoch": 1.37,
366
+ "l1_reg_loss": 0.0,
367
+ "learning_rate": 6e-06,
368
+ "loss": 2.1966,
369
+ "step": 23500
370
+ },
371
+ {
372
+ "epoch": 1.4,
373
+ "l1_reg_loss": 0.0,
374
+ "learning_rate": 4.000000000000001e-06,
375
+ "loss": 2.201,
376
+ "step": 24000
377
+ },
378
+ {
379
+ "epoch": 1.43,
380
+ "l1_reg_loss": 0.0,
381
+ "learning_rate": 2.0000000000000003e-06,
382
+ "loss": 2.1975,
383
+ "step": 24500
384
+ },
385
+ {
386
+ "epoch": 1.46,
387
+ "l1_reg_loss": 0.0,
388
+ "learning_rate": 0.0,
389
+ "loss": 2.1981,
390
+ "step": 25000
391
+ },
392
+ {
393
+ "epoch": 1.46,
394
+ "eval_loss": 2.210977077484131,
395
+ "eval_runtime": 487.0859,
396
+ "eval_samples_per_second": 13.661,
397
+ "eval_steps_per_second": 3.416,
398
+ "step": 25000
399
+ },
400
+ {
401
+ "epoch": 1.46,
402
+ "step": 25000,
403
+ "total_flos": 3.7147907956565606e+17,
404
+ "train_loss": 2.32260580078125,
405
+ "train_runtime": 45771.5275,
406
+ "train_samples_per_second": 4.37,
407
+ "train_steps_per_second": 0.546
408
+ }
409
+ ],
410
+ "max_steps": 25000,
411
+ "num_train_epochs": 2,
412
+ "total_flos": 3.7147907956565606e+17,
413
+ "trial_name": null,
414
+ "trial_params": null
415
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac9ab19482f2bafccf80ae6224969a3ccb78a1a95553a600eac6889ceb0b77a1
3
+ size 3375