ndaheim commited on
Commit
ad6c1a6
1 Parent(s): 9fe0f9d

initial model

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<Hint/Information_Reveal>": 50267, "<Correction>": 50266, "<user>": 50273, "<Other>": 50268, "<knowledge_tag>": 50272, "<agent>": 50270, "<knowledge_sep>": 50271, "<Question>": 50269, "<Confirmation>": 50265}
config.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/bart-base",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 12,
23
+ "encoder_ffn_dim": 3072,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 2,
27
+ "forced_bos_token_id": 0,
28
+ "forced_eos_token_id": 2,
29
+ "gradient_checkpointing": false,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2"
34
+ },
35
+ "init_std": 0.02,
36
+ "is_encoder_decoder": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
+ "max_length": 60,
43
+ "max_position_embeddings": 1024,
44
+ "model_type": "bart",
45
+ "no_repeat_ngram_size": 3,
46
+ "normalize_before": false,
47
+ "normalize_embedding": true,
48
+ "num_beams": 10,
49
+ "num_hidden_layers": 6,
50
+ "pad_token_id": 1,
51
+ "scale_embedding": false,
52
+ "task_specific_params": {
53
+ "summarization": {
54
+ "length_penalty": 1.0,
55
+ "max_length": 128,
56
+ "min_length": 12,
57
+ "num_beams": 4
58
+ },
59
+ "summarization_cnn": {
60
+ "length_penalty": 2.0,
61
+ "max_length": 142,
62
+ "min_length": 56,
63
+ "num_beams": 4
64
+ },
65
+ "summarization_xsum": {
66
+ "length_penalty": 1.0,
67
+ "max_length": 62,
68
+ "min_length": 11,
69
+ "num_beams": 6
70
+ }
71
+ },
72
+ "torch_dtype": "float32",
73
+ "transformers_version": "4.9.0",
74
+ "uid_regularization": 0.0,
75
+ "use_cache": true,
76
+ "vocab_size": 50274
77
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e282ca7c67b2f0a7a2ed1e93d7ee48848e6c8a737a982140071e8d4c343224d6
3
+ size 1115581221
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c18d8c21a5bec64b6b9a3f7e9101ff956e9340ebf8201779079425aa1f21a669
3
+ size 558013395
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d108da5ef66331f937797906f625c8846f74761e2da63219dc1978627b54d4b3
3
+ size 14657
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bb9bdc4c1c89f937942a1165786f8e682f0330541a622942e057de47622afdf
3
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}, "additional_special_tokens": ["<Confirmation>", "<Correction>", "<Hint/Information_Reveal>", "<Other>", "<Question>", "<agent>", "<knowledge_sep>", "<knowledge_tag>", "<user>"]}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "facebook/bart-base", "tokenizer_class": "BartTokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,858 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 16.99889502762431,
5
+ "global_step": 5763,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.38,
12
+ "gpu_memory": 2987030016,
13
+ "learning_rate": 8.32e-06,
14
+ "loss": 4.0407,
15
+ "step": 128
16
+ },
17
+ {
18
+ "epoch": 0.75,
19
+ "gpu_memory": 3076460544,
20
+ "learning_rate": 1.664e-05,
21
+ "loss": 2.405,
22
+ "step": 256
23
+ },
24
+ {
25
+ "epoch": 1.0,
26
+ "eval_bp": 0.035349686560536234,
27
+ "eval_counts": [
28
+ 505,
29
+ 125,
30
+ 50,
31
+ 11
32
+ ],
33
+ "eval_loss": 1.9292821884155273,
34
+ "eval_precisions": [
35
+ 46.118721461187214,
36
+ 15.723270440251572,
37
+ 9.861932938856016,
38
+ 4.471544715447155
39
+ ],
40
+ "eval_ref_len": 4755,
41
+ "eval_runtime": 44.1807,
42
+ "eval_samples_per_second": 6.79,
43
+ "eval_score": 0.47271078280719403,
44
+ "eval_steps_per_second": 6.79,
45
+ "eval_sys_len": 1095,
46
+ "eval_totals": [
47
+ 1095,
48
+ 795,
49
+ 507,
50
+ 246
51
+ ],
52
+ "gpu_memory": 3076460544,
53
+ "step": 339
54
+ },
55
+ {
56
+ "epoch": 1.13,
57
+ "gpu_memory": 3076460544,
58
+ "learning_rate": 2.4959999999999998e-05,
59
+ "loss": 2.0089,
60
+ "step": 384
61
+ },
62
+ {
63
+ "epoch": 1.51,
64
+ "gpu_memory": 3076460544,
65
+ "learning_rate": 3.2437898089171974e-05,
66
+ "loss": 1.8155,
67
+ "step": 512
68
+ },
69
+ {
70
+ "epoch": 1.89,
71
+ "gpu_memory": 3076460544,
72
+ "learning_rate": 3.1775477707006364e-05,
73
+ "loss": 1.7234,
74
+ "step": 640
75
+ },
76
+ {
77
+ "epoch": 2.0,
78
+ "eval_bp": 0.0840891954437523,
79
+ "eval_counts": [
80
+ 492,
81
+ 189,
82
+ 85,
83
+ 29
84
+ ],
85
+ "eval_loss": 1.6681220531463623,
86
+ "eval_precisions": [
87
+ 35.96491228070175,
88
+ 17.696629213483146,
89
+ 10.303030303030303,
90
+ 4.833333333333333
91
+ ],
92
+ "eval_ref_len": 4755,
93
+ "eval_runtime": 53.1682,
94
+ "eval_samples_per_second": 5.642,
95
+ "eval_score": 1.1219810390322362,
96
+ "eval_steps_per_second": 5.642,
97
+ "eval_sys_len": 1368,
98
+ "eval_totals": [
99
+ 1368,
100
+ 1068,
101
+ 825,
102
+ 600
103
+ ],
104
+ "gpu_memory": 3076460544,
105
+ "step": 678
106
+ },
107
+ {
108
+ "epoch": 2.27,
109
+ "gpu_memory": 3076460544,
110
+ "learning_rate": 3.111305732484076e-05,
111
+ "loss": 1.6058,
112
+ "step": 768
113
+ },
114
+ {
115
+ "epoch": 2.64,
116
+ "gpu_memory": 3076460544,
117
+ "learning_rate": 3.0450636942675155e-05,
118
+ "loss": 1.5189,
119
+ "step": 896
120
+ },
121
+ {
122
+ "epoch": 3.0,
123
+ "eval_bp": 0.09192776836698148,
124
+ "eval_counts": [
125
+ 571,
126
+ 192,
127
+ 93,
128
+ 40
129
+ ],
130
+ "eval_loss": 1.5985139608383179,
131
+ "eval_precisions": [
132
+ 40.66951566951567,
133
+ 17.391304347826086,
134
+ 11.03202846975089,
135
+ 6.734006734006734
136
+ ],
137
+ "eval_ref_len": 4755,
138
+ "eval_runtime": 44.5545,
139
+ "eval_samples_per_second": 6.733,
140
+ "eval_score": 1.391807704814939,
141
+ "eval_steps_per_second": 6.733,
142
+ "eval_sys_len": 1404,
143
+ "eval_totals": [
144
+ 1404,
145
+ 1104,
146
+ 843,
147
+ 594
148
+ ],
149
+ "gpu_memory": 3076460544,
150
+ "step": 1017
151
+ },
152
+ {
153
+ "epoch": 3.02,
154
+ "gpu_memory": 3076460544,
155
+ "learning_rate": 2.9788216560509553e-05,
156
+ "loss": 1.4885,
157
+ "step": 1024
158
+ },
159
+ {
160
+ "epoch": 3.4,
161
+ "gpu_memory": 3076460544,
162
+ "learning_rate": 2.9125796178343946e-05,
163
+ "loss": 1.334,
164
+ "step": 1152
165
+ },
166
+ {
167
+ "epoch": 3.77,
168
+ "gpu_memory": 3076460544,
169
+ "learning_rate": 2.8463375796178344e-05,
170
+ "loss": 1.3861,
171
+ "step": 1280
172
+ },
173
+ {
174
+ "epoch": 4.0,
175
+ "eval_bp": 0.034513967404432855,
176
+ "eval_counts": [
177
+ 432,
178
+ 173,
179
+ 84,
180
+ 35
181
+ ],
182
+ "eval_loss": 1.6043497323989868,
183
+ "eval_precisions": [
184
+ 39.66942148760331,
185
+ 21.926489226869457,
186
+ 16.184971098265898,
187
+ 9.48509485094851
188
+ ],
189
+ "eval_ref_len": 4755,
190
+ "eval_runtime": 42.8527,
191
+ "eval_samples_per_second": 7.001,
192
+ "eval_score": 0.6597653875525311,
193
+ "eval_steps_per_second": 7.001,
194
+ "eval_sys_len": 1089,
195
+ "eval_totals": [
196
+ 1089,
197
+ 789,
198
+ 519,
199
+ 369
200
+ ],
201
+ "gpu_memory": 3076460544,
202
+ "step": 1356
203
+ },
204
+ {
205
+ "epoch": 4.15,
206
+ "gpu_memory": 3076460544,
207
+ "learning_rate": 2.7800955414012737e-05,
208
+ "loss": 1.3367,
209
+ "step": 1408
210
+ },
211
+ {
212
+ "epoch": 4.53,
213
+ "gpu_memory": 3076460544,
214
+ "learning_rate": 2.713853503184713e-05,
215
+ "loss": 1.2828,
216
+ "step": 1536
217
+ },
218
+ {
219
+ "epoch": 4.91,
220
+ "gpu_memory": 3076460544,
221
+ "learning_rate": 2.647611464968153e-05,
222
+ "loss": 1.2571,
223
+ "step": 1664
224
+ },
225
+ {
226
+ "epoch": 5.0,
227
+ "eval_bp": 0.17929973112718744,
228
+ "eval_counts": [
229
+ 671,
230
+ 230,
231
+ 102,
232
+ 43
233
+ ],
234
+ "eval_loss": 1.5908681154251099,
235
+ "eval_precisions": [
236
+ 38.36477987421384,
237
+ 15.873015873015873,
238
+ 8.695652173913043,
239
+ 4.699453551912568
240
+ ],
241
+ "eval_ref_len": 4755,
242
+ "eval_runtime": 55.6404,
243
+ "eval_samples_per_second": 5.392,
244
+ "eval_score": 2.2519827467510987,
245
+ "eval_steps_per_second": 5.392,
246
+ "eval_sys_len": 1749,
247
+ "eval_totals": [
248
+ 1749,
249
+ 1449,
250
+ 1173,
251
+ 915
252
+ ],
253
+ "gpu_memory": 3076460544,
254
+ "step": 1695
255
+ },
256
+ {
257
+ "epoch": 5.29,
258
+ "gpu_memory": 3076460544,
259
+ "learning_rate": 2.5813694267515922e-05,
260
+ "loss": 1.2035,
261
+ "step": 1792
262
+ },
263
+ {
264
+ "epoch": 5.66,
265
+ "gpu_memory": 3076460544,
266
+ "learning_rate": 2.515127388535032e-05,
267
+ "loss": 1.183,
268
+ "step": 1920
269
+ },
270
+ {
271
+ "epoch": 6.0,
272
+ "eval_bp": 0.07050485313640832,
273
+ "eval_counts": [
274
+ 615,
275
+ 257,
276
+ 141,
277
+ 80
278
+ ],
279
+ "eval_loss": 1.5943706035614014,
280
+ "eval_precisions": [
281
+ 47.235023041474655,
282
+ 25.64870259481038,
283
+ 19.502074688796682,
284
+ 14.109347442680775
285
+ ],
286
+ "eval_ref_len": 4755,
287
+ "eval_runtime": 47.1825,
288
+ "eval_samples_per_second": 6.358,
289
+ "eval_score": 1.6941362350992444,
290
+ "eval_steps_per_second": 6.358,
291
+ "eval_sys_len": 1302,
292
+ "eval_totals": [
293
+ 1302,
294
+ 1002,
295
+ 723,
296
+ 567
297
+ ],
298
+ "gpu_memory": 3076460544,
299
+ "step": 2034
300
+ },
301
+ {
302
+ "epoch": 6.04,
303
+ "gpu_memory": 3076460544,
304
+ "learning_rate": 2.4488853503184713e-05,
305
+ "loss": 1.1964,
306
+ "step": 2048
307
+ },
308
+ {
309
+ "epoch": 6.42,
310
+ "gpu_memory": 3076460544,
311
+ "learning_rate": 2.3826433121019104e-05,
312
+ "loss": 1.1073,
313
+ "step": 2176
314
+ },
315
+ {
316
+ "epoch": 6.8,
317
+ "gpu_memory": 3076460544,
318
+ "learning_rate": 2.31640127388535e-05,
319
+ "loss": 1.1316,
320
+ "step": 2304
321
+ },
322
+ {
323
+ "epoch": 7.0,
324
+ "eval_bp": 0.10421315891869368,
325
+ "eval_counts": [
326
+ 649,
327
+ 197,
328
+ 79,
329
+ 22
330
+ ],
331
+ "eval_loss": 1.6070951223373413,
332
+ "eval_precisions": [
333
+ 44.51303155006859,
334
+ 17.012089810017272,
335
+ 9.111880046136102,
336
+ 3.559870550161812
337
+ ],
338
+ "eval_ref_len": 4755,
339
+ "eval_runtime": 47.6479,
340
+ "eval_samples_per_second": 6.296,
341
+ "eval_score": 1.3046509061748794,
342
+ "eval_steps_per_second": 6.296,
343
+ "eval_sys_len": 1458,
344
+ "eval_totals": [
345
+ 1458,
346
+ 1158,
347
+ 867,
348
+ 618
349
+ ],
350
+ "gpu_memory": 3076460544,
351
+ "step": 2373
352
+ },
353
+ {
354
+ "epoch": 7.17,
355
+ "gpu_memory": 3076460544,
356
+ "learning_rate": 2.2501592356687895e-05,
357
+ "loss": 1.0398,
358
+ "step": 2432
359
+ },
360
+ {
361
+ "epoch": 7.55,
362
+ "gpu_memory": 3076460544,
363
+ "learning_rate": 2.183917197452229e-05,
364
+ "loss": 1.0349,
365
+ "step": 2560
366
+ },
367
+ {
368
+ "epoch": 7.93,
369
+ "gpu_memory": 3076460544,
370
+ "learning_rate": 2.1176751592356686e-05,
371
+ "loss": 1.0816,
372
+ "step": 2688
373
+ },
374
+ {
375
+ "epoch": 8.0,
376
+ "eval_bp": 0.21001389512353258,
377
+ "eval_counts": [
378
+ 846,
379
+ 344,
380
+ 187,
381
+ 105
382
+ ],
383
+ "eval_loss": 1.6298103332519531,
384
+ "eval_precisions": [
385
+ 45.55735056542811,
386
+ 22.093770070648684,
387
+ 14.597970335675253,
388
+ 10.294117647058824
389
+ ],
390
+ "eval_ref_len": 4755,
391
+ "eval_runtime": 54.6716,
392
+ "eval_samples_per_second": 5.487,
393
+ "eval_score": 4.141670104799348,
394
+ "eval_steps_per_second": 5.487,
395
+ "eval_sys_len": 1857,
396
+ "eval_totals": [
397
+ 1857,
398
+ 1557,
399
+ 1281,
400
+ 1020
401
+ ],
402
+ "gpu_memory": 3076460544,
403
+ "step": 2712
404
+ },
405
+ {
406
+ "epoch": 8.31,
407
+ "gpu_memory": 3076460544,
408
+ "learning_rate": 2.051433121019108e-05,
409
+ "loss": 0.987,
410
+ "step": 2816
411
+ },
412
+ {
413
+ "epoch": 8.68,
414
+ "gpu_memory": 3076460544,
415
+ "learning_rate": 1.9851910828025477e-05,
416
+ "loss": 0.9829,
417
+ "step": 2944
418
+ },
419
+ {
420
+ "epoch": 9.0,
421
+ "eval_bp": 0.06525766524199453,
422
+ "eval_counts": [
423
+ 577,
424
+ 216,
425
+ 100,
426
+ 37
427
+ ],
428
+ "eval_loss": 1.6366333961486816,
429
+ "eval_precisions": [
430
+ 45.254901960784316,
431
+ 22.153846153846153,
432
+ 14.367816091954023,
433
+ 7.297830374753452
434
+ ],
435
+ "eval_ref_len": 4755,
436
+ "eval_runtime": 49.3567,
437
+ "eval_samples_per_second": 6.078,
438
+ "eval_score": 1.1750500193614282,
439
+ "eval_steps_per_second": 6.078,
440
+ "eval_sys_len": 1275,
441
+ "eval_totals": [
442
+ 1275,
443
+ 975,
444
+ 696,
445
+ 507
446
+ ],
447
+ "gpu_memory": 3076460544,
448
+ "step": 3051
449
+ },
450
+ {
451
+ "epoch": 9.06,
452
+ "gpu_memory": 3076460544,
453
+ "learning_rate": 1.918949044585987e-05,
454
+ "loss": 1.003,
455
+ "step": 3072
456
+ },
457
+ {
458
+ "epoch": 9.44,
459
+ "gpu_memory": 3076460544,
460
+ "learning_rate": 1.8527070063694264e-05,
461
+ "loss": 0.9337,
462
+ "step": 3200
463
+ },
464
+ {
465
+ "epoch": 9.82,
466
+ "gpu_memory": 3076460544,
467
+ "learning_rate": 1.786464968152866e-05,
468
+ "loss": 0.9325,
469
+ "step": 3328
470
+ },
471
+ {
472
+ "epoch": 10.0,
473
+ "eval_bp": 0.16851984622310243,
474
+ "eval_counts": [
475
+ 667,
476
+ 248,
477
+ 121,
478
+ 62
479
+ ],
480
+ "eval_loss": 1.67235267162323,
481
+ "eval_precisions": [
482
+ 39.005847953216374,
483
+ 17.588652482269502,
484
+ 10.503472222222221,
485
+ 6.68824163969795
486
+ ],
487
+ "eval_ref_len": 4755,
488
+ "eval_runtime": 50.0923,
489
+ "eval_samples_per_second": 5.989,
490
+ "eval_score": 2.4969097127652855,
491
+ "eval_steps_per_second": 5.989,
492
+ "eval_sys_len": 1710,
493
+ "eval_totals": [
494
+ 1710,
495
+ 1410,
496
+ 1152,
497
+ 927
498
+ ],
499
+ "gpu_memory": 3076460544,
500
+ "step": 3390
501
+ },
502
+ {
503
+ "epoch": 10.19,
504
+ "gpu_memory": 3076460544,
505
+ "learning_rate": 1.7202229299363055e-05,
506
+ "loss": 0.9075,
507
+ "step": 3456
508
+ },
509
+ {
510
+ "epoch": 10.57,
511
+ "gpu_memory": 3076460544,
512
+ "learning_rate": 1.6539808917197452e-05,
513
+ "loss": 0.8753,
514
+ "step": 3584
515
+ },
516
+ {
517
+ "epoch": 10.95,
518
+ "gpu_memory": 3076460544,
519
+ "learning_rate": 1.5877388535031846e-05,
520
+ "loss": 0.9098,
521
+ "step": 3712
522
+ },
523
+ {
524
+ "epoch": 11.0,
525
+ "eval_bp": 0.1483387334695538,
526
+ "eval_counts": [
527
+ 735,
528
+ 268,
529
+ 134,
530
+ 67
531
+ ],
532
+ "eval_loss": 1.6972090005874634,
533
+ "eval_precisions": [
534
+ 44.95412844036697,
535
+ 20.074906367041198,
536
+ 12.725546058879392,
537
+ 8.18070818070818
538
+ ],
539
+ "eval_ref_len": 4755,
540
+ "eval_runtime": 46.9235,
541
+ "eval_samples_per_second": 6.393,
542
+ "eval_score": 2.5970312545681904,
543
+ "eval_steps_per_second": 6.393,
544
+ "eval_sys_len": 1635,
545
+ "eval_totals": [
546
+ 1635,
547
+ 1335,
548
+ 1053,
549
+ 819
550
+ ],
551
+ "gpu_memory": 3076460544,
552
+ "step": 3729
553
+ },
554
+ {
555
+ "epoch": 11.33,
556
+ "gpu_memory": 3076460544,
557
+ "learning_rate": 1.5214968152866242e-05,
558
+ "loss": 0.839,
559
+ "step": 3840
560
+ },
561
+ {
562
+ "epoch": 11.7,
563
+ "gpu_memory": 3076460544,
564
+ "learning_rate": 1.4552547770700635e-05,
565
+ "loss": 0.8643,
566
+ "step": 3968
567
+ },
568
+ {
569
+ "epoch": 12.0,
570
+ "eval_bp": 0.1320190352563076,
571
+ "eval_counts": [
572
+ 715,
573
+ 285,
574
+ 143,
575
+ 70
576
+ ],
577
+ "eval_loss": 1.713928461074829,
578
+ "eval_precisions": [
579
+ 45.48346055979644,
580
+ 22.40566037735849,
581
+ 14.357429718875501,
582
+ 9.25925925925926
583
+ ],
584
+ "eval_ref_len": 4755,
585
+ "eval_runtime": 46.2792,
586
+ "eval_samples_per_second": 6.482,
587
+ "eval_score": 2.532809945547002,
588
+ "eval_steps_per_second": 6.482,
589
+ "eval_sys_len": 1572,
590
+ "eval_totals": [
591
+ 1572,
592
+ 1272,
593
+ 996,
594
+ 756
595
+ ],
596
+ "gpu_memory": 3076460544,
597
+ "step": 4068
598
+ },
599
+ {
600
+ "epoch": 12.08,
601
+ "gpu_memory": 3076460544,
602
+ "learning_rate": 1.3890127388535031e-05,
603
+ "loss": 0.8264,
604
+ "step": 4096
605
+ },
606
+ {
607
+ "epoch": 12.46,
608
+ "gpu_memory": 3076460544,
609
+ "learning_rate": 1.3227707006369426e-05,
610
+ "loss": 0.8008,
611
+ "step": 4224
612
+ },
613
+ {
614
+ "epoch": 12.84,
615
+ "gpu_memory": 3076460544,
616
+ "learning_rate": 1.2565286624203822e-05,
617
+ "loss": 0.7963,
618
+ "step": 4352
619
+ },
620
+ {
621
+ "epoch": 13.0,
622
+ "eval_bp": 0.18517745860640325,
623
+ "eval_counts": [
624
+ 782,
625
+ 310,
626
+ 160,
627
+ 79
628
+ ],
629
+ "eval_loss": 1.7276182174682617,
630
+ "eval_precisions": [
631
+ 44.18079096045198,
632
+ 21.08843537414966,
633
+ 13.43408900083963,
634
+ 8.44017094017094
635
+ ],
636
+ "eval_ref_len": 4755,
637
+ "eval_runtime": 49.531,
638
+ "eval_samples_per_second": 6.057,
639
+ "eval_score": 3.3384697611529055,
640
+ "eval_steps_per_second": 6.057,
641
+ "eval_sys_len": 1770,
642
+ "eval_totals": [
643
+ 1770,
644
+ 1470,
645
+ 1191,
646
+ 936
647
+ ],
648
+ "gpu_memory": 3076460544,
649
+ "step": 4407
650
+ },
651
+ {
652
+ "epoch": 13.22,
653
+ "gpu_memory": 3076460544,
654
+ "learning_rate": 1.1902866242038214e-05,
655
+ "loss": 0.791,
656
+ "step": 4480
657
+ },
658
+ {
659
+ "epoch": 13.59,
660
+ "gpu_memory": 3076460544,
661
+ "learning_rate": 1.124044585987261e-05,
662
+ "loss": 0.7591,
663
+ "step": 4608
664
+ },
665
+ {
666
+ "epoch": 13.97,
667
+ "gpu_memory": 3076460544,
668
+ "learning_rate": 1.0578025477707005e-05,
669
+ "loss": 0.7651,
670
+ "step": 4736
671
+ },
672
+ {
673
+ "epoch": 14.0,
674
+ "eval_bp": 0.17762954994257873,
675
+ "eval_counts": [
676
+ 784,
677
+ 310,
678
+ 160,
679
+ 81
680
+ ],
681
+ "eval_loss": 1.788110375404358,
682
+ "eval_precisions": [
683
+ 44.97991967871486,
684
+ 21.48302148302148,
685
+ 13.605442176870747,
686
+ 8.653846153846153
687
+ ],
688
+ "eval_ref_len": 4755,
689
+ "eval_runtime": 48.2995,
690
+ "eval_samples_per_second": 6.211,
691
+ "eval_score": 3.262302153360586,
692
+ "eval_steps_per_second": 6.211,
693
+ "eval_sys_len": 1743,
694
+ "eval_totals": [
695
+ 1743,
696
+ 1443,
697
+ 1176,
698
+ 936
699
+ ],
700
+ "gpu_memory": 3076460544,
701
+ "step": 4746
702
+ },
703
+ {
704
+ "epoch": 14.35,
705
+ "gpu_memory": 3076460544,
706
+ "learning_rate": 9.9156050955414e-06,
707
+ "loss": 0.7389,
708
+ "step": 4864
709
+ },
710
+ {
711
+ "epoch": 14.72,
712
+ "gpu_memory": 3076460544,
713
+ "learning_rate": 9.253184713375794e-06,
714
+ "loss": 0.7292,
715
+ "step": 4992
716
+ },
717
+ {
718
+ "epoch": 15.0,
719
+ "eval_bp": 0.19451009506119815,
720
+ "eval_counts": [
721
+ 756,
722
+ 286,
723
+ 139,
724
+ 66
725
+ ],
726
+ "eval_loss": 1.8334678411483765,
727
+ "eval_precisions": [
728
+ 41.930116472545755,
729
+ 19.028609447771125,
730
+ 11.356209150326798,
731
+ 6.790123456790123
732
+ ],
733
+ "eval_ref_len": 4755,
734
+ "eval_runtime": 50.9389,
735
+ "eval_samples_per_second": 5.889,
736
+ "eval_score": 3.063396343878355,
737
+ "eval_steps_per_second": 5.889,
738
+ "eval_sys_len": 1803,
739
+ "eval_totals": [
740
+ 1803,
741
+ 1503,
742
+ 1224,
743
+ 972
744
+ ],
745
+ "gpu_memory": 3076460544,
746
+ "step": 5085
747
+ },
748
+ {
749
+ "epoch": 15.1,
750
+ "gpu_memory": 3076460544,
751
+ "learning_rate": 8.59076433121019e-06,
752
+ "loss": 0.7051,
753
+ "step": 5120
754
+ },
755
+ {
756
+ "epoch": 15.48,
757
+ "gpu_memory": 3076460544,
758
+ "learning_rate": 7.928343949044585e-06,
759
+ "loss": 0.6872,
760
+ "step": 5248
761
+ },
762
+ {
763
+ "epoch": 15.86,
764
+ "gpu_memory": 3076460544,
765
+ "learning_rate": 7.265923566878981e-06,
766
+ "loss": 0.6935,
767
+ "step": 5376
768
+ },
769
+ {
770
+ "epoch": 16.0,
771
+ "eval_bp": 0.2204937574447589,
772
+ "eval_counts": [
773
+ 792,
774
+ 311,
775
+ 160,
776
+ 80
777
+ ],
778
+ "eval_loss": 1.8358988761901855,
779
+ "eval_precisions": [
780
+ 41.83835182250396,
781
+ 19.522912743251727,
782
+ 12.121212121212121,
783
+ 7.469654528478058
784
+ ],
785
+ "eval_ref_len": 4755,
786
+ "eval_runtime": 51.7295,
787
+ "eval_samples_per_second": 5.799,
788
+ "eval_score": 3.6361160482722528,
789
+ "eval_steps_per_second": 5.799,
790
+ "eval_sys_len": 1893,
791
+ "eval_totals": [
792
+ 1893,
793
+ 1593,
794
+ 1320,
795
+ 1071
796
+ ],
797
+ "gpu_memory": 3076460544,
798
+ "step": 5424
799
+ },
800
+ {
801
+ "epoch": 16.24,
802
+ "gpu_memory": 3076460544,
803
+ "learning_rate": 6.6035031847133755e-06,
804
+ "loss": 0.6808,
805
+ "step": 5504
806
+ },
807
+ {
808
+ "epoch": 16.61,
809
+ "gpu_memory": 3076460544,
810
+ "learning_rate": 5.94108280254777e-06,
811
+ "loss": 0.6649,
812
+ "step": 5632
813
+ },
814
+ {
815
+ "epoch": 16.99,
816
+ "gpu_memory": 3076460544,
817
+ "learning_rate": 5.278662420382165e-06,
818
+ "loss": 0.6902,
819
+ "step": 5760
820
+ },
821
+ {
822
+ "epoch": 17.0,
823
+ "eval_bp": 0.27059488659440983,
824
+ "eval_counts": [
825
+ 875,
826
+ 346,
827
+ 196,
828
+ 113
829
+ ],
830
+ "eval_loss": 1.8474984169006348,
831
+ "eval_precisions": [
832
+ 42.45511887433285,
833
+ 19.64792731402612,
834
+ 13.198653198653199,
835
+ 9.254709254709255
836
+ ],
837
+ "eval_ref_len": 4755,
838
+ "eval_runtime": 53.1106,
839
+ "eval_samples_per_second": 5.649,
840
+ "eval_score": 4.834531406134382,
841
+ "eval_steps_per_second": 5.649,
842
+ "eval_sys_len": 2061,
843
+ "eval_totals": [
844
+ 2061,
845
+ 1761,
846
+ 1485,
847
+ 1221
848
+ ],
849
+ "gpu_memory": 3076460544,
850
+ "step": 5763
851
+ }
852
+ ],
853
+ "max_steps": 6780,
854
+ "num_train_epochs": 20,
855
+ "total_flos": 5005888091043840.0,
856
+ "trial_name": null,
857
+ "trial_params": null
858
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6d0dd5195a5c72eed3cf01ad4025ca6a983050b6d3743b3153b17a436650aa6
3
+ size 2927
vocab.json ADDED
The diff for this file is too large to render. See raw diff