Marvin committed on
Commit
8f02e9b
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.model filter=lfs diff=lfs merge=lfs -text
3
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
4
+ runs/** filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - de
4
+ tags:
5
+ - question-generation
6
+ - german
7
+ - text2text-generation
8
+ - generated_from_trainer
9
+ datasets:
10
+ - lmqg/qg_dequad
11
+ metrics:
12
+ - bleu4
13
+ - f1
14
+ - rouge
15
+ - exact_match
16
+ model-index:
17
+ - name: german-jeopardy-mt5-large
18
+ results:
19
+ - task:
20
+ name: Sequence-to-sequence Language Modeling
21
+ type: text2text-generation
22
+ dataset:
23
+ name: lmqg/qg_dequad
24
+ type: default
25
+ args: default
26
+ metrics:
27
+ - name: BLEU-4
28
+ type: bleu4
29
+ value: 15.09
30
+ - name: F1
31
+ type: f1
32
+ value: 40.69
33
+ - name: ROUGE-1
34
+ type: rouge1
35
+ value: 41.68
36
+ - name: ROUGE-2
37
+ type: rouge2
38
+ value: 22.07
39
+ - name: ROUGE-L
40
+ type: rougel
41
+ value: 40.20
42
+ - name: ROUGE-Lsum
43
+ type: rougelsum
44
+ value: 40.19
45
+ - name: Exact Match
46
+ type: exact_match
47
+ value: 2.77
48
+ ---
49
+
50
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
51
+ should probably proofread and complete it, then remove this comment. -->
52
+
53
+ # german-jeopardy-mt5-large-1k-64-constant
54
+
55
+ This model is a fine-tuned version of [google/mt5-large](https://huggingface.co/google/mt5-large) on the [lmqg/qg_dequad](https://huggingface.co/datasets/lmqg/qg_dequad) dataset.
56
+ It achieves the following results on the test set:
57
+ - Loss: 1.8162
58
+ - Brevity Penalty: 0.9152
59
+ - System Length: 19102
60
+ - Reference Length: 20793
61
+ - ROUGE-1: 41.68
62
+ - ROUGE-2: 22.07
63
+ - ROUGE-L: 40.20
64
+ - ROUGE-Lsum: 40.19
65
+ - Exact Match: 2.77
66
+ - BLEU: 15.09
67
+ - F1: 40.69
68
+
69
+ ## Model description
70
+
71
+
72
+ See [google/mt5-large](https://huggingface.co/google/mt5-large) for the model architecture.
73
+ The model was trained on a single NVIDIA RTX 3090 GPU with 24GB of VRAM.
74
+
75
+ ## Intended uses & limitations
76
+
77
+ This model can be used for question generation on German text.
78
+
79
+ ## Training and evaluation data
80
+
81
+ See [lmqg/qg_dequad](https://huggingface.co/datasets/lmqg/qg_dequad).
82
+
83
+ ## Training procedure
84
+
85
+ ### Training hyperparameters
86
+
87
+ The following hyperparameters were used during training:
88
+ - learning_rate: 0.0001
89
+ - train_batch_size: 1
90
+ - eval_batch_size: 1
91
+ - seed: 7
92
+ - gradient_accumulation_steps: 64
93
+ - total_train_batch_size: 64
94
+ - optimizer: Adafactor
95
+ - lr_scheduler_type: constant
96
+ - num_epochs: 20
97
+
98
+ ### Training results
99
+
100
+ | Training Loss | Epoch | Step | BLEU | Brevity Penalty | Counts 1 | Counts 2 | Counts 3 | Counts 4 | Exact Match | F1 | Mean Generated Length | Validation Loss | Precisions 1 | Precisions 2 | Precisions 3 | Precisions 4 | Reference Length | ROUGE-1 | ROUGE-2 | ROUGE-L | ROUGE-Lsum | System Length | Totals 1 | Totals 2 | Totals 3 | Totals 4 |
101
+ |:-------------:|:-----:|:----:|:-------:|:---------------:|:--------:|:--------:|:--------:|:--------:|:-----------:|:------:|:---------------------:|:---------------:|:------------:|:------------:|:------------:|:------------:|:----------------:|:-------:|:-------:|:-------:|:----------:|:-------------:|:--------:|:--------:|:--------:|:--------:|
102
+ | 2.732 | 1.0 | 145 | 12.4473 | 0.7805 | 7779 | 2893 | 1393 | 685 | 0.0168 | 0.3393 | 12.2523 | 1.2989 | 45.6809 | 19.5143 | 11.0372 | 6.5758 | 21250 | 0.3487 | 0.1796 | 0.3329 | 0.3327 | 17029 | 17029 | 14825 | 12621 | 10417 |
103
+ | 1.5514 | 2.0 | 291 | 14.7663 | 0.7871 | 8297 | 3336 | 1711 | 899 | 0.025 | 0.3743 | 12.441 | 1.2100 | 48.3931 | 22.3278 | 13.4333 | 8.5351 | 21250 | 0.3839 | 0.2089 | 0.3688 | 0.369 | 17145 | 17145 | 14941 | 12737 | 10533 |
104
+ | 1.3546 | 3.0 | 435 | 16.3903 | 0.7798 | 8930 | 3713 | 1905 | 1022 | 0.034 | 0.4155 | 12.6021 | 1.1428 | 52.4739 | 25.0641 | 15.1071 | 9.8213 | 21250 | 0.4225 | 0.2345 | 0.4075 | 0.4074 | 17018 | 17018 | 14814 | 12610 | 10406 |
105
+ | 1.1969 | 4.0 | 581 | 17.8161 | 0.8441 | 9456 | 3994 | 2096 | 1157 | 0.0386 | 0.4334 | 13.4061 | 1.1113 | 52.039 | 25.0141 | 15.2292 | 10.0095 | 21250 | 0.4409 | 0.246 | 0.4251 | 0.4251 | 18171 | 18171 | 15967 | 13763 | 11559 |
106
+ | 1.0876 | 5.0 | 726 | 18.6911 | 0.8446 | 9606 | 4162 | 2233 | 1243 | 0.0377 | 0.443 | 13.5599 | 1.1032 | 52.8412 | 26.0532 | 16.2152 | 10.7461 | 21250 | 0.4504 | 0.2571 | 0.4356 | 0.4357 | 18179 | 18179 | 15975 | 13771 | 11567 |
107
+ | 0.9881 | 6.0 | 872 | 18.7071 | 0.8481 | 9608 | 4167 | 2235 | 1246 | 0.044 | 0.4429 | 13.6978 | 1.1119 | 52.661 | 25.9772 | 16.1523 | 10.7109 | 21250 | 0.4505 | 0.2567 | 0.4348 | 0.4349 | 18245 | 18245 | 16041 | 13837 | 11633 |
108
+ | 0.9142 | 7.0 | 1017 | 19.3053 | 0.8506 | 9757 | 4285 | 2311 | 1310 | 0.0495 | 0.451 | 13.5826 | 1.1106 | 53.3432 | 26.6364 | 16.6463 | 11.2167 | 21250 | 0.4587 | 0.2641 | 0.4427 | 0.443 | 18291 | 18291 | 16087 | 13883 | 11679 |
109
+ | 0.8323 | 8.0 | 1163 | 19.4102 | 0.8507 | 9757 | 4300 | 2341 | 1317 | 0.0472 | 0.4513 | 13.6239 | 1.1327 | 53.3373 | 26.7263 | 16.8599 | 11.2747 | 21250 | 0.4587 | 0.2662 | 0.4429 | 0.4426 | 18293 | 18293 | 16089 | 13885 | 11681 |
110
+ | 0.7742 | 9.0 | 1308 | 19.3574 | 0.8497 | 9757 | 4273 | 2324 | 1320 | 0.049 | 0.451 | 13.5944 | 1.1574 | 53.3957 | 26.5916 | 16.7616 | 11.3198 | 21250 | 0.4585 | 0.2653 | 0.4431 | 0.443 | 18273 | 18273 | 16069 | 13865 | 11661 |
111
+ | 0.7101 | 10.0 | 1454 | 20.1003 | 0.8694 | 9861 | 4403 | 2438 | 1416 | 0.0531 | 0.4525 | 13.9133 | 1.1674 | 52.8995 | 26.7871 | 17.1292 | 11.7716 | 21250 | 0.4594 | 0.2689 | 0.444 | 0.4435 | 18641 | 18641 | 16437 | 14233 | 12029 |
112
+ | 0.6642 | 10.99 | 1599 | 19.655 | 0.8558 | 9868 | 4380 | 2358 | 1337 | 0.0476 | 0.4551 | 13.9142 | 1.1889 | 53.6713 | 27.0671 | 16.8694 | 11.3555 | 21250 | 0.4622 | 0.2694 | 0.4469 | 0.4466 | 18386 | 18386 | 16182 | 13978 | 11774 |
113
+ | 0.6067 | 12.0 | 1745 | 19.9169 | 0.8828 | 9872 | 4384 | 2408 | 1395 | 0.0472 | 0.4489 | 14.2482 | 1.2207 | 52.2494 | 26.2672 | 16.6229 | 11.3581 | 21250 | 0.4569 | 0.2667 | 0.441 | 0.4408 | 18894 | 18894 | 16690 | 14486 | 12282 |
114
+ | 0.5684 | 12.99 | 1890 | 19.5451 | 0.8831 | 9870 | 4356 | 2360 | 1329 | 0.0485 | 0.4506 | 14.2432 | 1.2587 | 52.2195 | 26.0885 | 16.2837 | 10.8145 | 21250 | 0.4581 | 0.2651 | 0.4414 | 0.4409 | 18901 | 18901 | 16697 | 14493 | 12289 |
115
+ | 0.5288 | 14.0 | 2036 | 19.6648 | 0.8547 | 9815 | 4360 | 2389 | 1335 | 0.0454 | 0.4504 | 13.7432 | 1.2804 | 53.4382 | 26.9752 | 17.1144 | 11.3569 | 21250 | 0.4592 | 0.2671 | 0.4443 | 0.4436 | 18367 | 18367 | 16163 | 13959 | 11755 |
116
+ | 0.4902 | 14.99 | 2181 | 19.8138 | 0.8766 | 9886 | 4407 | 2398 | 1359 | 0.0495 | 0.451 | 14.1225 | 1.3211 | 52.6495 | 26.5914 | 16.6887 | 11.1714 | 21250 | 0.4582 | 0.2674 | 0.4426 | 0.4421 | 18777 | 18777 | 16573 | 14369 | 12165 |
117
+ | 0.4498 | 16.0 | 2327 | 20.0703 | 0.909 | 10008 | 4477 | 2456 | 1381 | 0.0476 | 0.4491 | 14.3725 | 1.3621 | 51.5903 | 26.0366 | 16.3832 | 10.8 | 21250 | 0.4569 | 0.2679 | 0.4415 | 0.4412 | 19399 | 19399 | 17195 | 14991 | 12787 |
118
+ | 0.4216 | 16.99 | 2472 | 20.1319 | 0.8948 | 10016 | 4483 | 2455 | 1385 | 0.0481 | 0.4531 | 14.3008 | 1.3967 | 52.3712 | 26.4937 | 16.6814 | 11.0685 | 21250 | 0.4615 | 0.2705 | 0.4457 | 0.4451 | 19125 | 19125 | 16921 | 14717 | 12513 |
119
+ | 0.3829 | 18.0 | 2618 | 19.8508 | 0.9123 | 9976 | 4407 | 2412 | 1374 | 0.0476 | 0.4479 | 14.7046 | 1.4460 | 51.2536 | 25.533 | 16.0202 | 10.6909 | 21250 | 0.4556 | 0.2627 | 0.4387 | 0.4385 | 19464 | 19464 | 17260 | 15056 | 12852 |
120
+ | 0.3551 | 19.0 | 2764 | 20.0572 | 0.8952 | 10010 | 4451 | 2438 | 1385 | 0.0463 | 0.4523 | 14.3807 | 1.4725 | 52.3235 | 26.2953 | 16.5591 | 11.0632 | 21250 | 0.4606 | 0.2672 | 0.4438 | 0.4434 | 19131 | 19131 | 16927 | 14723 | 12519 |
121
+ | 0.3301 | 19.93 | 2900 | 19.8047 | 0.8816 | 9858 | 4378 | 2406 | 1368 | 0.0495 | 0.4483 | 14.2795 | 1.5030 | 52.2361 | 26.2659 | 16.6344 | 11.1582 | 21250 | 0.4569 | 0.2644 | 0.4412 | 0.4405 | 18872 | 18872 | 16668 | 14464 | 12260 |
122
+
123
+
124
+ ### Framework versions
125
+
126
+ - Transformers 4.32.1
127
+ - Pytorch 2.1.0
128
+ - Datasets 2.12.0
129
+ - Tokenizers 0.13.3
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<hl>": 250100
3
+ }
all_results.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.93,
3
+ "eval_bleu": 19.8047,
4
+ "eval_bp": 0.8816,
5
+ "eval_counts_1": 9858,
6
+ "eval_counts_2": 4378,
7
+ "eval_counts_3": 2406,
8
+ "eval_counts_4": 1368,
9
+ "eval_exact_match": 0.0495,
10
+ "eval_f1": 0.4483,
11
+ "eval_gen_len": 14.2795,
12
+ "eval_loss": 1.5030488967895508,
13
+ "eval_precisions_1": 52.2361,
14
+ "eval_precisions_2": 26.2659,
15
+ "eval_precisions_3": 16.6344,
16
+ "eval_precisions_4": 11.1582,
17
+ "eval_ref_len": 21250,
18
+ "eval_rouge1": 0.4569,
19
+ "eval_rouge2": 0.2644,
20
+ "eval_rougeL": 0.4412,
21
+ "eval_rougeLsum": 0.4405,
22
+ "eval_runtime": 2187.575,
23
+ "eval_samples": 2204,
24
+ "eval_samples_per_second": 1.008,
25
+ "eval_steps_per_second": 1.008,
26
+ "eval_sys_len": 18872,
27
+ "eval_totals_1": 18872,
28
+ "eval_totals_2": 16668,
29
+ "eval_totals_3": 14464,
30
+ "eval_totals_4": 12260,
31
+ "predict_bleu": 14.494,
32
+ "predict_bp": 0.9213,
33
+ "predict_counts_1": 8898,
34
+ "predict_counts_2": 3367,
35
+ "predict_counts_3": 1600,
36
+ "predict_counts_4": 780,
37
+ "predict_exact_match": 0.0268,
38
+ "predict_f1": 0.3965,
39
+ "predict_gen_len": 14.6842,
40
+ "predict_loss": 1.8663314580917358,
41
+ "predict_precisions_1": 46.3028,
42
+ "predict_precisions_2": 19.7907,
43
+ "predict_precisions_3": 10.8042,
44
+ "predict_precisions_4": 6.188,
45
+ "predict_ref_len": 20793,
46
+ "predict_rouge1": 0.4064,
47
+ "predict_rouge2": 0.2138,
48
+ "predict_rougeL": 0.3919,
49
+ "predict_rougeLsum": 0.3914,
50
+ "predict_runtime": 2245.7822,
51
+ "predict_samples": 2204,
52
+ "predict_samples_per_second": 0.981,
53
+ "predict_steps_per_second": 0.981,
54
+ "predict_sys_len": 19217,
55
+ "predict_totals_1": 19217,
56
+ "predict_totals_2": 17013,
57
+ "predict_totals_3": 14809,
58
+ "predict_totals_4": 12605,
59
+ "train_loss": 0.6333936349276839,
60
+ "train_runtime": 110329.1002,
61
+ "train_samples": 9314,
62
+ "train_samples_per_second": 1.688,
63
+ "train_steps_per_second": 0.026
64
+ }
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/mt5-large",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2816,
8
+ "d_kv": 64,
9
+ "d_model": 1024,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "length_penalty": 0.0,
20
+ "max_length": 64,
21
+ "model_type": "mt5",
22
+ "num_beams": 4,
23
+ "num_decoder_layers": 24,
24
+ "num_heads": 16,
25
+ "num_layers": 24,
26
+ "output_past": true,
27
+ "pad_token_id": 0,
28
+ "relative_attention_max_distance": 128,
29
+ "relative_attention_num_buckets": 32,
30
+ "tie_word_embeddings": false,
31
+ "tokenizer_class": "T5Tokenizer",
32
+ "torch_dtype": "float32",
33
+ "transformers_version": "4.32.1",
34
+ "use_cache": true,
35
+ "vocab_size": 250112
36
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "decoder_start_token_id": 0,
3
+ "eos_token_id": 1,
4
+ "length_penalty": 0.0,
5
+ "max_length": 64,
6
+ "num_beams": 4,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "4.32.1"
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d4911da46fd490fd525410a130c54ae158cf2bcef89fa6e7ef555fd40801997
3
+ size 4918393736
runs/Oct21_13-27-58_MARVIN-PC/events.out.tfevents.1697887732.MARVIN-PC.18012.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0847a597cdc4989ca81f25e62905cdf4eda01996fb998004123627107836cfa9
3
+ size 7812
runs/Oct21_16-57-04_MARVIN-PC/events.out.tfevents.1697900288.MARVIN-PC.9500.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67b320f494619d940760bfbc53359b96650beb2f1ef4979dc24e5494b23e1d81
3
+ size 34070
runs/Oct21_16-57-04_MARVIN-PC/events.out.tfevents.1698012826.MARVIN-PC.9500.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d529b50ea1f1561f84443e763b918fe190a4b5c7f385c50217f60ab99b16f839
3
+ size 1550
special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<hl>"
4
+ ],
5
+ "eos_token": "</s>",
6
+ "pad_token": "<pad>",
7
+ "unk_token": "<unk>"
8
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
3
+ size 4309802
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ef4273f90fa3341fdb81c66eb7973b1651a8183e57c8ab9e9144aac61b48f9f
3
+ size 16330550
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "clean_up_tokenization_spaces": true,
4
+ "eos_token": "</s>",
5
+ "extra_ids": 0,
6
+ "legacy": true,
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "pad_token": "<pad>",
9
+ "sp_model_kwargs": {},
10
+ "tokenizer_class": "T5Tokenizer",
11
+ "unk_token": "<unk>"
12
+ }
trainer_state.json ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 19.934292462958986,
5
+ "eval_steps": 500,
6
+ "global_step": 2900,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "learning_rate": 0.0001,
14
+ "loss": 2.732,
15
+ "step": 145
16
+ },
17
+ {
18
+ "epoch": 1.0,
19
+ "eval_bleu": 12.4473,
20
+ "eval_bp": 0.7805,
21
+ "eval_counts_1": 7779,
22
+ "eval_counts_2": 2893,
23
+ "eval_counts_3": 1393,
24
+ "eval_counts_4": 685,
25
+ "eval_exact_match": 0.0168,
26
+ "eval_f1": 0.3393,
27
+ "eval_gen_len": 12.2523,
28
+ "eval_loss": 1.2989141941070557,
29
+ "eval_precisions_1": 45.6809,
30
+ "eval_precisions_2": 19.5143,
31
+ "eval_precisions_3": 11.0372,
32
+ "eval_precisions_4": 6.5758,
33
+ "eval_ref_len": 21250,
34
+ "eval_rouge1": 0.3487,
35
+ "eval_rouge2": 0.1796,
36
+ "eval_rougeL": 0.3329,
37
+ "eval_rougeLsum": 0.3327,
38
+ "eval_runtime": 2048.1193,
39
+ "eval_samples_per_second": 1.076,
40
+ "eval_steps_per_second": 1.076,
41
+ "eval_sys_len": 17029,
42
+ "eval_totals_1": 17029,
43
+ "eval_totals_2": 14825,
44
+ "eval_totals_3": 12621,
45
+ "eval_totals_4": 10417,
46
+ "step": 145
47
+ },
48
+ {
49
+ "epoch": 2.0,
50
+ "learning_rate": 0.0001,
51
+ "loss": 1.5514,
52
+ "step": 291
53
+ },
54
+ {
55
+ "epoch": 2.0,
56
+ "eval_bleu": 14.7663,
57
+ "eval_bp": 0.7871,
58
+ "eval_counts_1": 8297,
59
+ "eval_counts_2": 3336,
60
+ "eval_counts_3": 1711,
61
+ "eval_counts_4": 899,
62
+ "eval_exact_match": 0.025,
63
+ "eval_f1": 0.3743,
64
+ "eval_gen_len": 12.441,
65
+ "eval_loss": 1.20997154712677,
66
+ "eval_precisions_1": 48.3931,
67
+ "eval_precisions_2": 22.3278,
68
+ "eval_precisions_3": 13.4333,
69
+ "eval_precisions_4": 8.5351,
70
+ "eval_ref_len": 21250,
71
+ "eval_rouge1": 0.3839,
72
+ "eval_rouge2": 0.2089,
73
+ "eval_rougeL": 0.3688,
74
+ "eval_rougeLsum": 0.369,
75
+ "eval_runtime": 2570.171,
76
+ "eval_samples_per_second": 0.858,
77
+ "eval_steps_per_second": 0.858,
78
+ "eval_sys_len": 17145,
79
+ "eval_totals_1": 17145,
80
+ "eval_totals_2": 14941,
81
+ "eval_totals_3": 12737,
82
+ "eval_totals_4": 10533,
83
+ "step": 291
84
+ },
85
+ {
86
+ "epoch": 3.0,
87
+ "learning_rate": 0.0001,
88
+ "loss": 1.3546,
89
+ "step": 435
90
+ },
91
+ {
92
+ "epoch": 3.0,
93
+ "eval_bleu": 16.3903,
94
+ "eval_bp": 0.7798,
95
+ "eval_counts_1": 8930,
96
+ "eval_counts_2": 3713,
97
+ "eval_counts_3": 1905,
98
+ "eval_counts_4": 1022,
99
+ "eval_exact_match": 0.034,
100
+ "eval_f1": 0.4155,
101
+ "eval_gen_len": 12.6021,
102
+ "eval_loss": 1.142831563949585,
103
+ "eval_precisions_1": 52.4739,
104
+ "eval_precisions_2": 25.0641,
105
+ "eval_precisions_3": 15.1071,
106
+ "eval_precisions_4": 9.8213,
107
+ "eval_ref_len": 21250,
108
+ "eval_rouge1": 0.4225,
109
+ "eval_rouge2": 0.2345,
110
+ "eval_rougeL": 0.4075,
111
+ "eval_rougeLsum": 0.4074,
112
+ "eval_runtime": 2923.7087,
113
+ "eval_samples_per_second": 0.754,
114
+ "eval_steps_per_second": 0.754,
115
+ "eval_sys_len": 17018,
116
+ "eval_totals_1": 17018,
117
+ "eval_totals_2": 14814,
118
+ "eval_totals_3": 12610,
119
+ "eval_totals_4": 10406,
120
+ "step": 435
121
+ },
122
+ {
123
+ "epoch": 4.0,
124
+ "learning_rate": 0.0001,
125
+ "loss": 1.1969,
126
+ "step": 581
127
+ },
128
+ {
129
+ "epoch": 4.0,
130
+ "eval_bleu": 17.8161,
131
+ "eval_bp": 0.8441,
132
+ "eval_counts_1": 9456,
133
+ "eval_counts_2": 3994,
134
+ "eval_counts_3": 2096,
135
+ "eval_counts_4": 1157,
136
+ "eval_exact_match": 0.0386,
137
+ "eval_f1": 0.4334,
138
+ "eval_gen_len": 13.4061,
139
+ "eval_loss": 1.1113450527191162,
140
+ "eval_precisions_1": 52.039,
141
+ "eval_precisions_2": 25.0141,
142
+ "eval_precisions_3": 15.2292,
143
+ "eval_precisions_4": 10.0095,
144
+ "eval_ref_len": 21250,
145
+ "eval_rouge1": 0.4409,
146
+ "eval_rouge2": 0.246,
147
+ "eval_rougeL": 0.4251,
148
+ "eval_rougeLsum": 0.4251,
149
+ "eval_runtime": 2741.9646,
150
+ "eval_samples_per_second": 0.804,
151
+ "eval_steps_per_second": 0.804,
152
+ "eval_sys_len": 18171,
153
+ "eval_totals_1": 18171,
154
+ "eval_totals_2": 15967,
155
+ "eval_totals_3": 13763,
156
+ "eval_totals_4": 11559,
157
+ "step": 581
158
+ },
159
+ {
160
+ "epoch": 5.0,
161
+ "learning_rate": 0.0001,
162
+ "loss": 1.0876,
163
+ "step": 726
164
+ },
165
+ {
166
+ "epoch": 5.0,
167
+ "eval_bleu": 18.6911,
168
+ "eval_bp": 0.8446,
169
+ "eval_counts_1": 9606,
170
+ "eval_counts_2": 4162,
171
+ "eval_counts_3": 2233,
172
+ "eval_counts_4": 1243,
173
+ "eval_exact_match": 0.0377,
174
+ "eval_f1": 0.443,
175
+ "eval_gen_len": 13.5599,
176
+ "eval_loss": 1.1031831502914429,
177
+ "eval_precisions_1": 52.8412,
178
+ "eval_precisions_2": 26.0532,
179
+ "eval_precisions_3": 16.2152,
180
+ "eval_precisions_4": 10.7461,
181
+ "eval_ref_len": 21250,
182
+ "eval_rouge1": 0.4504,
183
+ "eval_rouge2": 0.2571,
184
+ "eval_rougeL": 0.4356,
185
+ "eval_rougeLsum": 0.4357,
186
+ "eval_runtime": 3812.6899,
187
+ "eval_samples_per_second": 0.578,
188
+ "eval_steps_per_second": 0.578,
189
+ "eval_sys_len": 18179,
190
+ "eval_totals_1": 18179,
191
+ "eval_totals_2": 15975,
192
+ "eval_totals_3": 13771,
193
+ "eval_totals_4": 11567,
194
+ "step": 726
195
+ },
196
+ {
197
+ "epoch": 6.0,
198
+ "learning_rate": 0.0001,
199
+ "loss": 0.9881,
200
+ "step": 872
201
+ },
202
+ {
203
+ "epoch": 6.0,
204
+ "eval_bleu": 18.7071,
205
+ "eval_bp": 0.8481,
206
+ "eval_counts_1": 9608,
207
+ "eval_counts_2": 4167,
208
+ "eval_counts_3": 2235,
209
+ "eval_counts_4": 1246,
210
+ "eval_exact_match": 0.044,
211
+ "eval_f1": 0.4429,
212
+ "eval_gen_len": 13.6978,
213
+ "eval_loss": 1.1118519306182861,
214
+ "eval_precisions_1": 52.661,
215
+ "eval_precisions_2": 25.9772,
216
+ "eval_precisions_3": 16.1523,
217
+ "eval_precisions_4": 10.7109,
218
+ "eval_ref_len": 21250,
219
+ "eval_rouge1": 0.4505,
220
+ "eval_rouge2": 0.2567,
221
+ "eval_rougeL": 0.4348,
222
+ "eval_rougeLsum": 0.4349,
223
+ "eval_runtime": 2020.0708,
224
+ "eval_samples_per_second": 1.091,
225
+ "eval_steps_per_second": 1.091,
226
+ "eval_sys_len": 18245,
227
+ "eval_totals_1": 18245,
228
+ "eval_totals_2": 16041,
229
+ "eval_totals_3": 13837,
230
+ "eval_totals_4": 11633,
231
+ "step": 872
232
+ },
233
+ {
234
+ "epoch": 7.0,
235
+ "learning_rate": 0.0001,
236
+ "loss": 0.9142,
237
+ "step": 1017
238
+ },
239
+ {
240
+ "epoch": 7.0,
241
+ "eval_bleu": 19.3053,
242
+ "eval_bp": 0.8506,
243
+ "eval_counts_1": 9757,
244
+ "eval_counts_2": 4285,
245
+ "eval_counts_3": 2311,
246
+ "eval_counts_4": 1310,
247
+ "eval_exact_match": 0.0495,
248
+ "eval_f1": 0.451,
249
+ "eval_gen_len": 13.5826,
250
+ "eval_loss": 1.1105936765670776,
251
+ "eval_precisions_1": 53.3432,
252
+ "eval_precisions_2": 26.6364,
253
+ "eval_precisions_3": 16.6463,
254
+ "eval_precisions_4": 11.2167,
255
+ "eval_ref_len": 21250,
256
+ "eval_rouge1": 0.4587,
257
+ "eval_rouge2": 0.2641,
258
+ "eval_rougeL": 0.4427,
259
+ "eval_rougeLsum": 0.443,
260
+ "eval_runtime": 1991.0459,
261
+ "eval_samples_per_second": 1.107,
262
+ "eval_steps_per_second": 1.107,
263
+ "eval_sys_len": 18291,
264
+ "eval_totals_1": 18291,
265
+ "eval_totals_2": 16087,
266
+ "eval_totals_3": 13883,
267
+ "eval_totals_4": 11679,
268
+ "step": 1017
269
+ },
270
+ {
271
+ "epoch": 8.0,
272
+ "learning_rate": 0.0001,
273
+ "loss": 0.8323,
274
+ "step": 1163
275
+ },
276
+ {
277
+ "epoch": 8.0,
278
+ "eval_bleu": 19.4102,
279
+ "eval_bp": 0.8507,
280
+ "eval_counts_1": 9757,
281
+ "eval_counts_2": 4300,
282
+ "eval_counts_3": 2341,
283
+ "eval_counts_4": 1317,
284
+ "eval_exact_match": 0.0472,
285
+ "eval_f1": 0.4513,
286
+ "eval_gen_len": 13.6239,
287
+ "eval_loss": 1.1327157020568848,
288
+ "eval_precisions_1": 53.3373,
289
+ "eval_precisions_2": 26.7263,
290
+ "eval_precisions_3": 16.8599,
291
+ "eval_precisions_4": 11.2747,
292
+ "eval_ref_len": 21250,
293
+ "eval_rouge1": 0.4587,
294
+ "eval_rouge2": 0.2662,
295
+ "eval_rougeL": 0.4429,
296
+ "eval_rougeLsum": 0.4426,
297
+ "eval_runtime": 1972.0648,
298
+ "eval_samples_per_second": 1.118,
299
+ "eval_steps_per_second": 1.118,
300
+ "eval_sys_len": 18293,
301
+ "eval_totals_1": 18293,
302
+ "eval_totals_2": 16089,
303
+ "eval_totals_3": 13885,
304
+ "eval_totals_4": 11681,
305
+ "step": 1163
306
+ },
307
+ {
308
+ "epoch": 9.0,
309
+ "learning_rate": 0.0001,
310
+ "loss": 0.7742,
311
+ "step": 1308
312
+ },
313
+ {
314
+ "epoch": 9.0,
315
+ "eval_bleu": 19.3574,
316
+ "eval_bp": 0.8497,
317
+ "eval_counts_1": 9757,
318
+ "eval_counts_2": 4273,
319
+ "eval_counts_3": 2324,
320
+ "eval_counts_4": 1320,
321
+ "eval_exact_match": 0.049,
322
+ "eval_f1": 0.451,
323
+ "eval_gen_len": 13.5944,
324
+ "eval_loss": 1.1574428081512451,
325
+ "eval_precisions_1": 53.3957,
326
+ "eval_precisions_2": 26.5916,
327
+ "eval_precisions_3": 16.7616,
328
+ "eval_precisions_4": 11.3198,
329
+ "eval_ref_len": 21250,
330
+ "eval_rouge1": 0.4585,
331
+ "eval_rouge2": 0.2653,
332
+ "eval_rougeL": 0.4431,
333
+ "eval_rougeLsum": 0.443,
334
+ "eval_runtime": 1991.8737,
335
+ "eval_samples_per_second": 1.106,
336
+ "eval_steps_per_second": 1.106,
337
+ "eval_sys_len": 18273,
338
+ "eval_totals_1": 18273,
339
+ "eval_totals_2": 16069,
340
+ "eval_totals_3": 13865,
341
+ "eval_totals_4": 11661,
342
+ "step": 1308
343
+ },
344
+ {
345
+ "epoch": 10.0,
346
+ "learning_rate": 0.0001,
347
+ "loss": 0.7101,
348
+ "step": 1454
349
+ },
350
+ {
351
+ "epoch": 10.0,
352
+ "eval_bleu": 20.1003,
353
+ "eval_bp": 0.8694,
354
+ "eval_counts_1": 9861,
355
+ "eval_counts_2": 4403,
356
+ "eval_counts_3": 2438,
357
+ "eval_counts_4": 1416,
358
+ "eval_exact_match": 0.0531,
359
+ "eval_f1": 0.4525,
360
+ "eval_gen_len": 13.9133,
361
+ "eval_loss": 1.167409896850586,
362
+ "eval_precisions_1": 52.8995,
363
+ "eval_precisions_2": 26.7871,
364
+ "eval_precisions_3": 17.1292,
365
+ "eval_precisions_4": 11.7716,
366
+ "eval_ref_len": 21250,
367
+ "eval_rouge1": 0.4594,
368
+ "eval_rouge2": 0.2689,
369
+ "eval_rougeL": 0.444,
370
+ "eval_rougeLsum": 0.4435,
371
+ "eval_runtime": 2025.3437,
372
+ "eval_samples_per_second": 1.088,
373
+ "eval_steps_per_second": 1.088,
374
+ "eval_sys_len": 18641,
375
+ "eval_totals_1": 18641,
376
+ "eval_totals_2": 16437,
377
+ "eval_totals_3": 14233,
378
+ "eval_totals_4": 12029,
379
+ "step": 1454
380
+ },
381
+ {
382
+ "epoch": 10.99,
383
+ "learning_rate": 0.0001,
384
+ "loss": 0.6642,
385
+ "step": 1599
386
+ },
387
+ {
388
+ "epoch": 10.99,
389
+ "eval_bleu": 19.655,
390
+ "eval_bp": 0.8558,
391
+ "eval_counts_1": 9868,
392
+ "eval_counts_2": 4380,
393
+ "eval_counts_3": 2358,
394
+ "eval_counts_4": 1337,
395
+ "eval_exact_match": 0.0476,
396
+ "eval_f1": 0.4551,
397
+ "eval_gen_len": 13.9142,
398
+ "eval_loss": 1.1888612508773804,
399
+ "eval_precisions_1": 53.6713,
400
+ "eval_precisions_2": 27.0671,
401
+ "eval_precisions_3": 16.8694,
402
+ "eval_precisions_4": 11.3555,
403
+ "eval_ref_len": 21250,
404
+ "eval_rouge1": 0.4622,
405
+ "eval_rouge2": 0.2694,
406
+ "eval_rougeL": 0.4469,
407
+ "eval_rougeLsum": 0.4466,
408
+ "eval_runtime": 2020.9205,
409
+ "eval_samples_per_second": 1.091,
410
+ "eval_steps_per_second": 1.091,
411
+ "eval_sys_len": 18386,
412
+ "eval_totals_1": 18386,
413
+ "eval_totals_2": 16182,
414
+ "eval_totals_3": 13978,
415
+ "eval_totals_4": 11774,
416
+ "step": 1599
417
+ },
418
+ {
419
+ "epoch": 12.0,
420
+ "learning_rate": 0.0001,
421
+ "loss": 0.6067,
422
+ "step": 1745
423
+ },
424
+ {
425
+ "epoch": 12.0,
426
+ "eval_bleu": 19.9169,
427
+ "eval_bp": 0.8828,
428
+ "eval_counts_1": 9872,
429
+ "eval_counts_2": 4384,
430
+ "eval_counts_3": 2408,
431
+ "eval_counts_4": 1395,
432
+ "eval_exact_match": 0.0472,
433
+ "eval_f1": 0.4489,
434
+ "eval_gen_len": 14.2482,
435
+ "eval_loss": 1.2207266092300415,
436
+ "eval_precisions_1": 52.2494,
437
+ "eval_precisions_2": 26.2672,
438
+ "eval_precisions_3": 16.6229,
439
+ "eval_precisions_4": 11.3581,
440
+ "eval_ref_len": 21250,
441
+ "eval_rouge1": 0.4569,
442
+ "eval_rouge2": 0.2667,
443
+ "eval_rougeL": 0.441,
444
+ "eval_rougeLsum": 0.4408,
445
+ "eval_runtime": 2047.2616,
446
+ "eval_samples_per_second": 1.077,
447
+ "eval_steps_per_second": 1.077,
448
+ "eval_sys_len": 18894,
449
+ "eval_totals_1": 18894,
450
+ "eval_totals_2": 16690,
451
+ "eval_totals_3": 14486,
452
+ "eval_totals_4": 12282,
453
+ "step": 1745
454
+ },
455
+ {
456
+ "epoch": 12.99,
457
+ "learning_rate": 0.0001,
458
+ "loss": 0.5684,
459
+ "step": 1890
460
+ },
461
+ {
462
+ "epoch": 12.99,
463
+ "eval_bleu": 19.5451,
464
+ "eval_bp": 0.8831,
465
+ "eval_counts_1": 9870,
466
+ "eval_counts_2": 4356,
467
+ "eval_counts_3": 2360,
468
+ "eval_counts_4": 1329,
469
+ "eval_exact_match": 0.0485,
470
+ "eval_f1": 0.4506,
471
+ "eval_gen_len": 14.2432,
472
+ "eval_loss": 1.2586854696273804,
473
+ "eval_precisions_1": 52.2195,
474
+ "eval_precisions_2": 26.0885,
475
+ "eval_precisions_3": 16.2837,
476
+ "eval_precisions_4": 10.8145,
477
+ "eval_ref_len": 21250,
478
+ "eval_rouge1": 0.4581,
479
+ "eval_rouge2": 0.2651,
480
+ "eval_rougeL": 0.4414,
481
+ "eval_rougeLsum": 0.4409,
482
+ "eval_runtime": 2126.3316,
483
+ "eval_samples_per_second": 1.037,
484
+ "eval_steps_per_second": 1.037,
485
+ "eval_sys_len": 18901,
486
+ "eval_totals_1": 18901,
487
+ "eval_totals_2": 16697,
488
+ "eval_totals_3": 14493,
489
+ "eval_totals_4": 12289,
490
+ "step": 1890
491
+ },
492
+ {
493
+ "epoch": 14.0,
494
+ "learning_rate": 0.0001,
495
+ "loss": 0.5288,
496
+ "step": 2036
497
+ },
498
+ {
499
+ "epoch": 14.0,
500
+ "eval_bleu": 19.6648,
501
+ "eval_bp": 0.8547,
502
+ "eval_counts_1": 9815,
503
+ "eval_counts_2": 4360,
504
+ "eval_counts_3": 2389,
505
+ "eval_counts_4": 1335,
506
+ "eval_exact_match": 0.0454,
507
+ "eval_f1": 0.4504,
508
+ "eval_gen_len": 13.7432,
509
+ "eval_loss": 1.2803738117218018,
510
+ "eval_precisions_1": 53.4382,
511
+ "eval_precisions_2": 26.9752,
512
+ "eval_precisions_3": 17.1144,
513
+ "eval_precisions_4": 11.3569,
514
+ "eval_ref_len": 21250,
515
+ "eval_rouge1": 0.4592,
516
+ "eval_rouge2": 0.2671,
517
+ "eval_rougeL": 0.4443,
518
+ "eval_rougeLsum": 0.4436,
519
+ "eval_runtime": 3989.4275,
520
+ "eval_samples_per_second": 0.552,
521
+ "eval_steps_per_second": 0.552,
522
+ "eval_sys_len": 18367,
523
+ "eval_totals_1": 18367,
524
+ "eval_totals_2": 16163,
525
+ "eval_totals_3": 13959,
526
+ "eval_totals_4": 11755,
527
+ "step": 2036
528
+ },
529
+ {
530
+ "epoch": 14.99,
531
+ "learning_rate": 0.0001,
532
+ "loss": 0.4902,
533
+ "step": 2181
534
+ },
535
+ {
536
+ "epoch": 14.99,
537
+ "eval_bleu": 19.8138,
538
+ "eval_bp": 0.8766,
539
+ "eval_counts_1": 9886,
540
+ "eval_counts_2": 4407,
541
+ "eval_counts_3": 2398,
542
+ "eval_counts_4": 1359,
543
+ "eval_exact_match": 0.0495,
544
+ "eval_f1": 0.451,
545
+ "eval_gen_len": 14.1225,
546
+ "eval_loss": 1.321104884147644,
547
+ "eval_precisions_1": 52.6495,
548
+ "eval_precisions_2": 26.5914,
549
+ "eval_precisions_3": 16.6887,
550
+ "eval_precisions_4": 11.1714,
551
+ "eval_ref_len": 21250,
552
+ "eval_rouge1": 0.4582,
553
+ "eval_rouge2": 0.2674,
554
+ "eval_rougeL": 0.4426,
555
+ "eval_rougeLsum": 0.4421,
556
+ "eval_runtime": 2190.6068,
557
+ "eval_samples_per_second": 1.006,
558
+ "eval_steps_per_second": 1.006,
559
+ "eval_sys_len": 18777,
560
+ "eval_totals_1": 18777,
561
+ "eval_totals_2": 16573,
562
+ "eval_totals_3": 14369,
563
+ "eval_totals_4": 12165,
564
+ "step": 2181
565
+ },
566
+ {
567
+ "epoch": 16.0,
568
+ "learning_rate": 0.0001,
569
+ "loss": 0.4498,
570
+ "step": 2327
571
+ },
572
+ {
573
+ "epoch": 16.0,
574
+ "eval_bleu": 20.0703,
575
+ "eval_bp": 0.909,
576
+ "eval_counts_1": 10008,
577
+ "eval_counts_2": 4477,
578
+ "eval_counts_3": 2456,
579
+ "eval_counts_4": 1381,
580
+ "eval_exact_match": 0.0476,
581
+ "eval_f1": 0.4491,
582
+ "eval_gen_len": 14.3725,
583
+ "eval_loss": 1.3620938062667847,
584
+ "eval_precisions_1": 51.5903,
585
+ "eval_precisions_2": 26.0366,
586
+ "eval_precisions_3": 16.3832,
587
+ "eval_precisions_4": 10.8,
588
+ "eval_ref_len": 21250,
589
+ "eval_rouge1": 0.4569,
590
+ "eval_rouge2": 0.2679,
591
+ "eval_rougeL": 0.4415,
592
+ "eval_rougeLsum": 0.4412,
593
+ "eval_runtime": 4080.8757,
594
+ "eval_samples_per_second": 0.54,
595
+ "eval_steps_per_second": 0.54,
596
+ "eval_sys_len": 19399,
597
+ "eval_totals_1": 19399,
598
+ "eval_totals_2": 17195,
599
+ "eval_totals_3": 14991,
600
+ "eval_totals_4": 12787,
601
+ "step": 2327
602
+ },
603
+ {
604
+ "epoch": 16.99,
605
+ "learning_rate": 0.0001,
606
+ "loss": 0.4216,
607
+ "step": 2472
608
+ },
609
+ {
610
+ "epoch": 16.99,
611
+ "eval_bleu": 20.1319,
612
+ "eval_bp": 0.8948,
613
+ "eval_counts_1": 10016,
614
+ "eval_counts_2": 4483,
615
+ "eval_counts_3": 2455,
616
+ "eval_counts_4": 1385,
617
+ "eval_exact_match": 0.0481,
618
+ "eval_f1": 0.4531,
619
+ "eval_gen_len": 14.3008,
620
+ "eval_loss": 1.3966974020004272,
621
+ "eval_precisions_1": 52.3712,
622
+ "eval_precisions_2": 26.4937,
623
+ "eval_precisions_3": 16.6814,
624
+ "eval_precisions_4": 11.0685,
625
+ "eval_ref_len": 21250,
626
+ "eval_rouge1": 0.4615,
627
+ "eval_rouge2": 0.2705,
628
+ "eval_rougeL": 0.4457,
629
+ "eval_rougeLsum": 0.4451,
630
+ "eval_runtime": 3311.0939,
631
+ "eval_samples_per_second": 0.666,
632
+ "eval_steps_per_second": 0.666,
633
+ "eval_sys_len": 19125,
634
+ "eval_totals_1": 19125,
635
+ "eval_totals_2": 16921,
636
+ "eval_totals_3": 14717,
637
+ "eval_totals_4": 12513,
638
+ "step": 2472
639
+ },
640
+ {
641
+ "epoch": 18.0,
642
+ "learning_rate": 0.0001,
643
+ "loss": 0.3829,
644
+ "step": 2618
645
+ },
646
+ {
647
+ "epoch": 18.0,
648
+ "eval_bleu": 19.8508,
649
+ "eval_bp": 0.9123,
650
+ "eval_counts_1": 9976,
651
+ "eval_counts_2": 4407,
652
+ "eval_counts_3": 2412,
653
+ "eval_counts_4": 1374,
654
+ "eval_exact_match": 0.0476,
655
+ "eval_f1": 0.4479,
656
+ "eval_gen_len": 14.7046,
657
+ "eval_loss": 1.4460452795028687,
658
+ "eval_precisions_1": 51.2536,
659
+ "eval_precisions_2": 25.533,
660
+ "eval_precisions_3": 16.0202,
661
+ "eval_precisions_4": 10.6909,
662
+ "eval_ref_len": 21250,
663
+ "eval_rouge1": 0.4556,
664
+ "eval_rouge2": 0.2627,
665
+ "eval_rougeL": 0.4387,
666
+ "eval_rougeLsum": 0.4385,
667
+ "eval_runtime": 3748.4463,
668
+ "eval_samples_per_second": 0.588,
669
+ "eval_steps_per_second": 0.588,
670
+ "eval_sys_len": 19464,
671
+ "eval_totals_1": 19464,
672
+ "eval_totals_2": 17260,
673
+ "eval_totals_3": 15056,
674
+ "eval_totals_4": 12852,
675
+ "step": 2618
676
+ },
677
+ {
678
+ "epoch": 19.0,
679
+ "learning_rate": 0.0001,
680
+ "loss": 0.3551,
681
+ "step": 2764
682
+ },
683
+ {
684
+ "epoch": 19.0,
685
+ "eval_bleu": 20.0572,
686
+ "eval_bp": 0.8952,
687
+ "eval_counts_1": 10010,
688
+ "eval_counts_2": 4451,
689
+ "eval_counts_3": 2438,
690
+ "eval_counts_4": 1385,
691
+ "eval_exact_match": 0.0463,
692
+ "eval_f1": 0.4523,
693
+ "eval_gen_len": 14.3807,
694
+ "eval_loss": 1.4725110530853271,
695
+ "eval_precisions_1": 52.3235,
696
+ "eval_precisions_2": 26.2953,
697
+ "eval_precisions_3": 16.5591,
698
+ "eval_precisions_4": 11.0632,
699
+ "eval_ref_len": 21250,
700
+ "eval_rouge1": 0.4606,
701
+ "eval_rouge2": 0.2672,
702
+ "eval_rougeL": 0.4438,
703
+ "eval_rougeLsum": 0.4434,
704
+ "eval_runtime": 2215.2029,
705
+ "eval_samples_per_second": 0.995,
706
+ "eval_steps_per_second": 0.995,
707
+ "eval_sys_len": 19131,
708
+ "eval_totals_1": 19131,
709
+ "eval_totals_2": 16927,
710
+ "eval_totals_3": 14723,
711
+ "eval_totals_4": 12519,
712
+ "step": 2764
713
+ },
714
+ {
715
+ "epoch": 19.93,
716
+ "learning_rate": 0.0001,
717
+ "loss": 0.3301,
718
+ "step": 2900
719
+ },
720
+ {
721
+ "epoch": 19.93,
722
+ "eval_bleu": 19.8047,
723
+ "eval_bp": 0.8816,
724
+ "eval_counts_1": 9858,
725
+ "eval_counts_2": 4378,
726
+ "eval_counts_3": 2406,
727
+ "eval_counts_4": 1368,
728
+ "eval_exact_match": 0.0495,
729
+ "eval_f1": 0.4483,
730
+ "eval_gen_len": 14.2795,
731
+ "eval_loss": 1.5030488967895508,
732
+ "eval_precisions_1": 52.2361,
733
+ "eval_precisions_2": 26.2659,
734
+ "eval_precisions_3": 16.6344,
735
+ "eval_precisions_4": 11.1582,
736
+ "eval_ref_len": 21250,
737
+ "eval_rouge1": 0.4569,
738
+ "eval_rouge2": 0.2644,
739
+ "eval_rougeL": 0.4412,
740
+ "eval_rougeLsum": 0.4405,
741
+ "eval_runtime": 2181.7432,
742
+ "eval_samples_per_second": 1.01,
743
+ "eval_steps_per_second": 1.01,
744
+ "eval_sys_len": 18872,
745
+ "eval_totals_1": 18872,
746
+ "eval_totals_2": 16668,
747
+ "eval_totals_3": 14464,
748
+ "eval_totals_4": 12260,
749
+ "step": 2900
750
+ },
751
+ {
752
+ "epoch": 19.93,
753
+ "step": 2900,
754
+ "total_flos": 1.1100924470624256e+18,
755
+ "train_loss": 0.6333936349276839,
756
+ "train_runtime": 110329.1002,
757
+ "train_samples_per_second": 1.688,
758
+ "train_steps_per_second": 0.026
759
+ }
760
+ ],
761
+ "logging_steps": 500,
762
+ "max_steps": 2900,
763
+ "num_train_epochs": 20,
764
+ "save_steps": 500,
765
+ "total_flos": 1.1100924470624256e+18,
766
+ "trial_name": null,
767
+ "trial_params": null
768
+ }
training_args.bin ADDED
Binary file (4.66 kB). View file