riken01 commited on
Commit
185828d
1 Parent(s): 955fb19

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ tags:
4
+ - autotrain
5
+ - text2text-generation
6
+ base_model: facebook/bart-large-cnn
7
+ widget:
8
+ - text: "I love AutoTrain"
9
+ ---
10
+
11
+ # Model Trained Using AutoTrain
12
+
13
+ - Problem type: Seq2Seq
14
+
15
+ ## Validation Metrics
16
+ loss: 4.248934268951416
17
+
18
+ rouge1: 25.7685
19
+
20
+ rouge2: 9.8226
21
+
22
+ rougeL: 24.6426
23
+
24
+ rougeLsum: 24.9756
25
+
26
+ gen_len: 61.8854
27
+
28
+ runtime: 700.1847
29
+
30
+ samples_per_second: 1.595
31
+
32
+ steps_per_second: 0.4
33
+
34
+ : 3.0
checkpoint-8938/config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/bart-large-cnn",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "gelu",
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 12,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 16,
23
+ "encoder_ffn_dim": 4096,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 12,
26
+ "eos_token_id": 2,
27
+ "force_bos_token_to_be_generated": true,
28
+ "forced_bos_token_id": 0,
29
+ "forced_eos_token_id": 2,
30
+ "gradient_checkpointing": false,
31
+ "id2label": {
32
+ "0": "LABEL_0",
33
+ "1": "LABEL_1",
34
+ "2": "LABEL_2"
35
+ },
36
+ "init_std": 0.02,
37
+ "is_encoder_decoder": true,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1,
41
+ "LABEL_2": 2
42
+ },
43
+ "length_penalty": 2.0,
44
+ "max_length": 142,
45
+ "max_position_embeddings": 1024,
46
+ "min_length": 56,
47
+ "model_type": "bart",
48
+ "no_repeat_ngram_size": 3,
49
+ "normalize_before": false,
50
+ "num_beams": 4,
51
+ "num_hidden_layers": 12,
52
+ "output_past": true,
53
+ "pad_token_id": 1,
54
+ "prefix": " ",
55
+ "scale_embedding": false,
56
+ "task_specific_params": {
57
+ "summarization": {
58
+ "early_stopping": true,
59
+ "length_penalty": 2.0,
60
+ "max_length": 142,
61
+ "min_length": 56,
62
+ "no_repeat_ngram_size": 3,
63
+ "num_beams": 4
64
+ }
65
+ },
66
+ "torch_dtype": "float32",
67
+ "transformers_version": "4.41.0",
68
+ "use_cache": false,
69
+ "vocab_size": 50265
70
+ }
checkpoint-8938/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 0,
3
+ "decoder_start_token_id": 2,
4
+ "early_stopping": true,
5
+ "eos_token_id": 2,
6
+ "forced_bos_token_id": 0,
7
+ "forced_eos_token_id": 2,
8
+ "length_penalty": 2.0,
9
+ "max_length": 142,
10
+ "min_length": 56,
11
+ "no_repeat_ngram_size": 3,
12
+ "num_beams": 4,
13
+ "pad_token_id": 1,
14
+ "transformers_version": "4.41.0",
15
+ "use_cache": false
16
+ }
checkpoint-8938/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-8938/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddd6fefb4de9b6731b7efcc997fdb1cc4db285111506952feaefad76b0cf7029
3
+ size 1625426996
checkpoint-8938/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27c884bb33254e02b91f137180d353c1a0f3e50ff043bb824e4235bf9eb7ffba
3
+ size 3250759951
checkpoint-8938/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38f1c8b673bdcc0876119e83cceb497451dd5a363c4b7203f158e58d54417987
3
+ size 14244
checkpoint-8938/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b2ea4d809759782474d493220a94a73376cbd4e942abfbb7ae792ca7a782093
3
+ size 1064
checkpoint-8938/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
checkpoint-8938/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-8938/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "BartTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
checkpoint-8938/trainer_state.json ADDED
@@ -0,0 +1,2567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 4.248934268951416,
3
+ "best_model_checkpoint": "autotrain-l6hey-orl0t/checkpoint-8938",
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 8938,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005594092638174088,
13
+ "grad_norm": 34.332366943359375,
14
+ "learning_rate": 7.829977628635347e-07,
15
+ "loss": 6.4897,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.011188185276348177,
20
+ "grad_norm": 23.168872833251953,
21
+ "learning_rate": 1.7151379567486951e-06,
22
+ "loss": 6.3738,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.016782277914522265,
27
+ "grad_norm": 20.714399337768555,
28
+ "learning_rate": 2.6472781506338553e-06,
29
+ "loss": 5.5122,
30
+ "step": 75
31
+ },
32
+ {
33
+ "epoch": 0.022376370552696354,
34
+ "grad_norm": 17.31004524230957,
35
+ "learning_rate": 3.542132736763609e-06,
36
+ "loss": 5.5352,
37
+ "step": 100
38
+ },
39
+ {
40
+ "epoch": 0.027970463190870442,
41
+ "grad_norm": 20.792905807495117,
42
+ "learning_rate": 4.47427293064877e-06,
43
+ "loss": 5.3902,
44
+ "step": 125
45
+ },
46
+ {
47
+ "epoch": 0.03356455582904453,
48
+ "grad_norm": 29.867664337158203,
49
+ "learning_rate": 5.40641312453393e-06,
50
+ "loss": 5.0552,
51
+ "step": 150
52
+ },
53
+ {
54
+ "epoch": 0.039158648467218615,
55
+ "grad_norm": 17.143095016479492,
56
+ "learning_rate": 6.338553318419091e-06,
57
+ "loss": 5.0792,
58
+ "step": 175
59
+ },
60
+ {
61
+ "epoch": 0.04475274110539271,
62
+ "grad_norm": 16.778640747070312,
63
+ "learning_rate": 7.270693512304251e-06,
64
+ "loss": 5.0412,
65
+ "step": 200
66
+ },
67
+ {
68
+ "epoch": 0.05034683374356679,
69
+ "grad_norm": 21.074262619018555,
70
+ "learning_rate": 8.20283370618941e-06,
71
+ "loss": 4.8163,
72
+ "step": 225
73
+ },
74
+ {
75
+ "epoch": 0.055940926381740884,
76
+ "grad_norm": 21.221630096435547,
77
+ "learning_rate": 9.134973900074571e-06,
78
+ "loss": 4.9409,
79
+ "step": 250
80
+ },
81
+ {
82
+ "epoch": 0.06153501901991497,
83
+ "grad_norm": 21.189231872558594,
84
+ "learning_rate": 1.006711409395973e-05,
85
+ "loss": 5.0649,
86
+ "step": 275
87
+ },
88
+ {
89
+ "epoch": 0.06712911165808906,
90
+ "grad_norm": 14.992874145507812,
91
+ "learning_rate": 1.0999254287844893e-05,
92
+ "loss": 4.7542,
93
+ "step": 300
94
+ },
95
+ {
96
+ "epoch": 0.07272320429626315,
97
+ "grad_norm": 15.733444213867188,
98
+ "learning_rate": 1.1931394481730052e-05,
99
+ "loss": 4.5938,
100
+ "step": 325
101
+ },
102
+ {
103
+ "epoch": 0.07831729693443723,
104
+ "grad_norm": 23.274600982666016,
105
+ "learning_rate": 1.2863534675615213e-05,
106
+ "loss": 4.6138,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 0.08391138957261132,
111
+ "grad_norm": 17.92934799194336,
112
+ "learning_rate": 1.3795674869500374e-05,
113
+ "loss": 4.8809,
114
+ "step": 375
115
+ },
116
+ {
117
+ "epoch": 0.08950548221078541,
118
+ "grad_norm": 18.286834716796875,
119
+ "learning_rate": 1.4727815063385533e-05,
120
+ "loss": 4.6919,
121
+ "step": 400
122
+ },
123
+ {
124
+ "epoch": 0.09509957484895949,
125
+ "grad_norm": 20.009130477905273,
126
+ "learning_rate": 1.5659955257270695e-05,
127
+ "loss": 4.7557,
128
+ "step": 425
129
+ },
130
+ {
131
+ "epoch": 0.10069366748713358,
132
+ "grad_norm": 20.796175003051758,
133
+ "learning_rate": 1.6592095451155853e-05,
134
+ "loss": 4.8059,
135
+ "step": 450
136
+ },
137
+ {
138
+ "epoch": 0.10628776012530768,
139
+ "grad_norm": 18.083595275878906,
140
+ "learning_rate": 1.7524235645041014e-05,
141
+ "loss": 4.86,
142
+ "step": 475
143
+ },
144
+ {
145
+ "epoch": 0.11188185276348177,
146
+ "grad_norm": 17.57659339904785,
147
+ "learning_rate": 1.8456375838926178e-05,
148
+ "loss": 4.568,
149
+ "step": 500
150
+ },
151
+ {
152
+ "epoch": 0.11747594540165585,
153
+ "grad_norm": 20.379024505615234,
154
+ "learning_rate": 1.9388516032811335e-05,
155
+ "loss": 4.9103,
156
+ "step": 525
157
+ },
158
+ {
159
+ "epoch": 0.12307003803982994,
160
+ "grad_norm": 16.093093872070312,
161
+ "learning_rate": 2.0320656226696496e-05,
162
+ "loss": 4.5315,
163
+ "step": 550
164
+ },
165
+ {
166
+ "epoch": 0.12866413067800403,
167
+ "grad_norm": 31.44219970703125,
168
+ "learning_rate": 2.1252796420581657e-05,
169
+ "loss": 4.6423,
170
+ "step": 575
171
+ },
172
+ {
173
+ "epoch": 0.13425822331617812,
174
+ "grad_norm": 16.757986068725586,
175
+ "learning_rate": 2.2184936614466818e-05,
176
+ "loss": 4.5965,
177
+ "step": 600
178
+ },
179
+ {
180
+ "epoch": 0.1398523159543522,
181
+ "grad_norm": 16.31531524658203,
182
+ "learning_rate": 2.311707680835198e-05,
183
+ "loss": 4.6928,
184
+ "step": 625
185
+ },
186
+ {
187
+ "epoch": 0.1454464085925263,
188
+ "grad_norm": 13.83728313446045,
189
+ "learning_rate": 2.4049217002237136e-05,
190
+ "loss": 4.5197,
191
+ "step": 650
192
+ },
193
+ {
194
+ "epoch": 0.15104050123070037,
195
+ "grad_norm": 17.00248146057129,
196
+ "learning_rate": 2.49813571961223e-05,
197
+ "loss": 4.4602,
198
+ "step": 675
199
+ },
200
+ {
201
+ "epoch": 0.15663459386887446,
202
+ "grad_norm": 22.1146183013916,
203
+ "learning_rate": 2.5913497390007457e-05,
204
+ "loss": 4.4573,
205
+ "step": 700
206
+ },
207
+ {
208
+ "epoch": 0.16222868650704855,
209
+ "grad_norm": 16.24863624572754,
210
+ "learning_rate": 2.6845637583892618e-05,
211
+ "loss": 4.6094,
212
+ "step": 725
213
+ },
214
+ {
215
+ "epoch": 0.16782277914522264,
216
+ "grad_norm": 15.607491493225098,
217
+ "learning_rate": 2.777777777777778e-05,
218
+ "loss": 4.4851,
219
+ "step": 750
220
+ },
221
+ {
222
+ "epoch": 0.17341687178339674,
223
+ "grad_norm": 17.399606704711914,
224
+ "learning_rate": 2.8709917971662943e-05,
225
+ "loss": 4.7641,
226
+ "step": 775
227
+ },
228
+ {
229
+ "epoch": 0.17901096442157083,
230
+ "grad_norm": 12.437596321105957,
231
+ "learning_rate": 2.9642058165548097e-05,
232
+ "loss": 4.6065,
233
+ "step": 800
234
+ },
235
+ {
236
+ "epoch": 0.18460505705974492,
237
+ "grad_norm": 16.83686637878418,
238
+ "learning_rate": 3.057419835943326e-05,
239
+ "loss": 4.471,
240
+ "step": 825
241
+ },
242
+ {
243
+ "epoch": 0.19019914969791898,
244
+ "grad_norm": 17.172122955322266,
245
+ "learning_rate": 3.150633855331842e-05,
246
+ "loss": 4.7266,
247
+ "step": 850
248
+ },
249
+ {
250
+ "epoch": 0.19579324233609308,
251
+ "grad_norm": 18.03239631652832,
252
+ "learning_rate": 3.243847874720358e-05,
253
+ "loss": 4.46,
254
+ "step": 875
255
+ },
256
+ {
257
+ "epoch": 0.20138733497426717,
258
+ "grad_norm": 11.44616985321045,
259
+ "learning_rate": 3.3370618941088744e-05,
260
+ "loss": 4.2646,
261
+ "step": 900
262
+ },
263
+ {
264
+ "epoch": 0.20698142761244126,
265
+ "grad_norm": 18.69893455505371,
266
+ "learning_rate": 3.43027591349739e-05,
267
+ "loss": 4.5796,
268
+ "step": 925
269
+ },
270
+ {
271
+ "epoch": 0.21257552025061535,
272
+ "grad_norm": 26.265470504760742,
273
+ "learning_rate": 3.523489932885906e-05,
274
+ "loss": 4.476,
275
+ "step": 950
276
+ },
277
+ {
278
+ "epoch": 0.21816961288878944,
279
+ "grad_norm": 28.28611946105957,
280
+ "learning_rate": 3.616703952274422e-05,
281
+ "loss": 4.9502,
282
+ "step": 975
283
+ },
284
+ {
285
+ "epoch": 0.22376370552696354,
286
+ "grad_norm": 20.53813362121582,
287
+ "learning_rate": 3.709917971662939e-05,
288
+ "loss": 4.5447,
289
+ "step": 1000
290
+ },
291
+ {
292
+ "epoch": 0.22935779816513763,
293
+ "grad_norm": 14.012741088867188,
294
+ "learning_rate": 3.8031319910514545e-05,
295
+ "loss": 4.7348,
296
+ "step": 1025
297
+ },
298
+ {
299
+ "epoch": 0.2349518908033117,
300
+ "grad_norm": 15.572279930114746,
301
+ "learning_rate": 3.89634601043997e-05,
302
+ "loss": 4.7246,
303
+ "step": 1050
304
+ },
305
+ {
306
+ "epoch": 0.24054598344148578,
307
+ "grad_norm": 13.881638526916504,
308
+ "learning_rate": 3.9895600298284866e-05,
309
+ "loss": 4.6181,
310
+ "step": 1075
311
+ },
312
+ {
313
+ "epoch": 0.24614007607965988,
314
+ "grad_norm": 12.749186515808105,
315
+ "learning_rate": 4.0827740492170024e-05,
316
+ "loss": 4.3893,
317
+ "step": 1100
318
+ },
319
+ {
320
+ "epoch": 0.251734168717834,
321
+ "grad_norm": 17.131837844848633,
322
+ "learning_rate": 4.175988068605519e-05,
323
+ "loss": 4.3859,
324
+ "step": 1125
325
+ },
326
+ {
327
+ "epoch": 0.25732826135600806,
328
+ "grad_norm": 13.731075286865234,
329
+ "learning_rate": 4.2692020879940345e-05,
330
+ "loss": 4.4761,
331
+ "step": 1150
332
+ },
333
+ {
334
+ "epoch": 0.2629223539941821,
335
+ "grad_norm": 15.064730644226074,
336
+ "learning_rate": 4.36241610738255e-05,
337
+ "loss": 4.4364,
338
+ "step": 1175
339
+ },
340
+ {
341
+ "epoch": 0.26851644663235624,
342
+ "grad_norm": 13.714836120605469,
343
+ "learning_rate": 4.455630126771067e-05,
344
+ "loss": 4.5969,
345
+ "step": 1200
346
+ },
347
+ {
348
+ "epoch": 0.2741105392705303,
349
+ "grad_norm": 14.1889009475708,
350
+ "learning_rate": 4.5488441461595824e-05,
351
+ "loss": 4.9771,
352
+ "step": 1225
353
+ },
354
+ {
355
+ "epoch": 0.2797046319087044,
356
+ "grad_norm": 14.543293952941895,
357
+ "learning_rate": 4.642058165548099e-05,
358
+ "loss": 4.6537,
359
+ "step": 1250
360
+ },
361
+ {
362
+ "epoch": 0.2852987245468785,
363
+ "grad_norm": 18.1214542388916,
364
+ "learning_rate": 4.735272184936615e-05,
365
+ "loss": 4.6208,
366
+ "step": 1275
367
+ },
368
+ {
369
+ "epoch": 0.2908928171850526,
370
+ "grad_norm": 13.347675323486328,
371
+ "learning_rate": 4.82848620432513e-05,
372
+ "loss": 4.6199,
373
+ "step": 1300
374
+ },
375
+ {
376
+ "epoch": 0.2964869098232267,
377
+ "grad_norm": 17.083444595336914,
378
+ "learning_rate": 4.921700223713647e-05,
379
+ "loss": 4.8805,
380
+ "step": 1325
381
+ },
382
+ {
383
+ "epoch": 0.30208100246140074,
384
+ "grad_norm": 24.343929290771484,
385
+ "learning_rate": 4.998342449859108e-05,
386
+ "loss": 4.6357,
387
+ "step": 1350
388
+ },
389
+ {
390
+ "epoch": 0.30767509509957486,
391
+ "grad_norm": 16.355289459228516,
392
+ "learning_rate": 4.987982761478535e-05,
393
+ "loss": 4.674,
394
+ "step": 1375
395
+ },
396
+ {
397
+ "epoch": 0.3132691877377489,
398
+ "grad_norm": 20.73784828186035,
399
+ "learning_rate": 4.978037460633184e-05,
400
+ "loss": 4.6211,
401
+ "step": 1400
402
+ },
403
+ {
404
+ "epoch": 0.31886328037592304,
405
+ "grad_norm": 13.50486946105957,
406
+ "learning_rate": 4.9676777722526106e-05,
407
+ "loss": 4.7257,
408
+ "step": 1425
409
+ },
410
+ {
411
+ "epoch": 0.3244573730140971,
412
+ "grad_norm": 14.019612312316895,
413
+ "learning_rate": 4.9573180838720376e-05,
414
+ "loss": 4.6509,
415
+ "step": 1450
416
+ },
417
+ {
418
+ "epoch": 0.3300514656522712,
419
+ "grad_norm": 18.581026077270508,
420
+ "learning_rate": 4.946958395491464e-05,
421
+ "loss": 4.3779,
422
+ "step": 1475
423
+ },
424
+ {
425
+ "epoch": 0.3356455582904453,
426
+ "grad_norm": 12.192296981811523,
427
+ "learning_rate": 4.93659870711089e-05,
428
+ "loss": 4.2878,
429
+ "step": 1500
430
+ },
431
+ {
432
+ "epoch": 0.34123965092861935,
433
+ "grad_norm": 12.066230773925781,
434
+ "learning_rate": 4.926239018730317e-05,
435
+ "loss": 4.5682,
436
+ "step": 1525
437
+ },
438
+ {
439
+ "epoch": 0.3468337435667935,
440
+ "grad_norm": 10.833414077758789,
441
+ "learning_rate": 4.9158793303497436e-05,
442
+ "loss": 4.1772,
443
+ "step": 1550
444
+ },
445
+ {
446
+ "epoch": 0.35242783620496754,
447
+ "grad_norm": 10.834297180175781,
448
+ "learning_rate": 4.905519641969169e-05,
449
+ "loss": 4.6879,
450
+ "step": 1575
451
+ },
452
+ {
453
+ "epoch": 0.35802192884314166,
454
+ "grad_norm": 12.870365142822266,
455
+ "learning_rate": 4.895159953588596e-05,
456
+ "loss": 4.7267,
457
+ "step": 1600
458
+ },
459
+ {
460
+ "epoch": 0.3636160214813157,
461
+ "grad_norm": 10.991371154785156,
462
+ "learning_rate": 4.8848002652080226e-05,
463
+ "loss": 4.6822,
464
+ "step": 1625
465
+ },
466
+ {
467
+ "epoch": 0.36921011411948984,
468
+ "grad_norm": 14.67126178741455,
469
+ "learning_rate": 4.874440576827449e-05,
470
+ "loss": 4.3277,
471
+ "step": 1650
472
+ },
473
+ {
474
+ "epoch": 0.3748042067576639,
475
+ "grad_norm": 13.142556190490723,
476
+ "learning_rate": 4.864080888446876e-05,
477
+ "loss": 4.513,
478
+ "step": 1675
479
+ },
480
+ {
481
+ "epoch": 0.38039829939583797,
482
+ "grad_norm": 12.647134780883789,
483
+ "learning_rate": 4.853721200066302e-05,
484
+ "loss": 4.6069,
485
+ "step": 1700
486
+ },
487
+ {
488
+ "epoch": 0.3859923920340121,
489
+ "grad_norm": 12.48798942565918,
490
+ "learning_rate": 4.8433615116857286e-05,
491
+ "loss": 4.846,
492
+ "step": 1725
493
+ },
494
+ {
495
+ "epoch": 0.39158648467218615,
496
+ "grad_norm": 10.90004825592041,
497
+ "learning_rate": 4.833001823305155e-05,
498
+ "loss": 4.4091,
499
+ "step": 1750
500
+ },
501
+ {
502
+ "epoch": 0.39718057731036027,
503
+ "grad_norm": 14.13915729522705,
504
+ "learning_rate": 4.822642134924581e-05,
505
+ "loss": 4.4192,
506
+ "step": 1775
507
+ },
508
+ {
509
+ "epoch": 0.40277466994853434,
510
+ "grad_norm": 13.102397918701172,
511
+ "learning_rate": 4.812282446544008e-05,
512
+ "loss": 4.5444,
513
+ "step": 1800
514
+ },
515
+ {
516
+ "epoch": 0.40836876258670846,
517
+ "grad_norm": 12.227831840515137,
518
+ "learning_rate": 4.8019227581634346e-05,
519
+ "loss": 4.712,
520
+ "step": 1825
521
+ },
522
+ {
523
+ "epoch": 0.4139628552248825,
524
+ "grad_norm": 15.215840339660645,
525
+ "learning_rate": 4.791563069782861e-05,
526
+ "loss": 4.6632,
527
+ "step": 1850
528
+ },
529
+ {
530
+ "epoch": 0.41955694786305664,
531
+ "grad_norm": 18.023183822631836,
532
+ "learning_rate": 4.781203381402288e-05,
533
+ "loss": 4.1778,
534
+ "step": 1875
535
+ },
536
+ {
537
+ "epoch": 0.4251510405012307,
538
+ "grad_norm": 13.230670928955078,
539
+ "learning_rate": 4.770843693021714e-05,
540
+ "loss": 4.4197,
541
+ "step": 1900
542
+ },
543
+ {
544
+ "epoch": 0.43074513313940477,
545
+ "grad_norm": 14.070335388183594,
546
+ "learning_rate": 4.7604840046411407e-05,
547
+ "loss": 4.6564,
548
+ "step": 1925
549
+ },
550
+ {
551
+ "epoch": 0.4363392257775789,
552
+ "grad_norm": 20.313472747802734,
553
+ "learning_rate": 4.750124316260567e-05,
554
+ "loss": 4.5667,
555
+ "step": 1950
556
+ },
557
+ {
558
+ "epoch": 0.44193331841575295,
559
+ "grad_norm": 15.953713417053223,
560
+ "learning_rate": 4.739764627879993e-05,
561
+ "loss": 4.5235,
562
+ "step": 1975
563
+ },
564
+ {
565
+ "epoch": 0.44752741105392707,
566
+ "grad_norm": 10.95453929901123,
567
+ "learning_rate": 4.7294049394994197e-05,
568
+ "loss": 4.5047,
569
+ "step": 2000
570
+ },
571
+ {
572
+ "epoch": 0.45312150369210114,
573
+ "grad_norm": 15.660309791564941,
574
+ "learning_rate": 4.719045251118847e-05,
575
+ "loss": 4.1537,
576
+ "step": 2025
577
+ },
578
+ {
579
+ "epoch": 0.45871559633027525,
580
+ "grad_norm": 11.03080940246582,
581
+ "learning_rate": 4.708685562738273e-05,
582
+ "loss": 4.3448,
583
+ "step": 2050
584
+ },
585
+ {
586
+ "epoch": 0.4643096889684493,
587
+ "grad_norm": 10.359949111938477,
588
+ "learning_rate": 4.698325874357699e-05,
589
+ "loss": 4.3634,
590
+ "step": 2075
591
+ },
592
+ {
593
+ "epoch": 0.4699037816066234,
594
+ "grad_norm": 25.401718139648438,
595
+ "learning_rate": 4.6879661859771263e-05,
596
+ "loss": 4.2735,
597
+ "step": 2100
598
+ },
599
+ {
600
+ "epoch": 0.4754978742447975,
601
+ "grad_norm": 14.679646492004395,
602
+ "learning_rate": 4.677606497596553e-05,
603
+ "loss": 4.4465,
604
+ "step": 2125
605
+ },
606
+ {
607
+ "epoch": 0.48109196688297157,
608
+ "grad_norm": 11.817214965820312,
609
+ "learning_rate": 4.667246809215979e-05,
610
+ "loss": 4.629,
611
+ "step": 2150
612
+ },
613
+ {
614
+ "epoch": 0.4866860595211457,
615
+ "grad_norm": 10.622258186340332,
616
+ "learning_rate": 4.656887120835405e-05,
617
+ "loss": 4.4189,
618
+ "step": 2175
619
+ },
620
+ {
621
+ "epoch": 0.49228015215931975,
622
+ "grad_norm": 15.188981056213379,
623
+ "learning_rate": 4.646527432454832e-05,
624
+ "loss": 4.2514,
625
+ "step": 2200
626
+ },
627
+ {
628
+ "epoch": 0.49787424479749387,
629
+ "grad_norm": 14.326010704040527,
630
+ "learning_rate": 4.636167744074259e-05,
631
+ "loss": 4.3532,
632
+ "step": 2225
633
+ },
634
+ {
635
+ "epoch": 0.503468337435668,
636
+ "grad_norm": 16.31020164489746,
637
+ "learning_rate": 4.625808055693685e-05,
638
+ "loss": 4.2333,
639
+ "step": 2250
640
+ },
641
+ {
642
+ "epoch": 0.509062430073842,
643
+ "grad_norm": 18.346088409423828,
644
+ "learning_rate": 4.6154483673131113e-05,
645
+ "loss": 4.4483,
646
+ "step": 2275
647
+ },
648
+ {
649
+ "epoch": 0.5146565227120161,
650
+ "grad_norm": 15.169132232666016,
651
+ "learning_rate": 4.6050886789325384e-05,
652
+ "loss": 4.3814,
653
+ "step": 2300
654
+ },
655
+ {
656
+ "epoch": 0.5202506153501902,
657
+ "grad_norm": 10.011805534362793,
658
+ "learning_rate": 4.594728990551964e-05,
659
+ "loss": 4.6235,
660
+ "step": 2325
661
+ },
662
+ {
663
+ "epoch": 0.5258447079883642,
664
+ "grad_norm": 11.23599624633789,
665
+ "learning_rate": 4.584369302171391e-05,
666
+ "loss": 4.5861,
667
+ "step": 2350
668
+ },
669
+ {
670
+ "epoch": 0.5314388006265384,
671
+ "grad_norm": 10.484898567199707,
672
+ "learning_rate": 4.5740096137908174e-05,
673
+ "loss": 4.3894,
674
+ "step": 2375
675
+ },
676
+ {
677
+ "epoch": 0.5370328932647125,
678
+ "grad_norm": 12.192333221435547,
679
+ "learning_rate": 4.563649925410244e-05,
680
+ "loss": 4.448,
681
+ "step": 2400
682
+ },
683
+ {
684
+ "epoch": 0.5426269859028866,
685
+ "grad_norm": 15.25631332397461,
686
+ "learning_rate": 4.553290237029671e-05,
687
+ "loss": 4.3787,
688
+ "step": 2425
689
+ },
690
+ {
691
+ "epoch": 0.5482210785410606,
692
+ "grad_norm": 10.27658748626709,
693
+ "learning_rate": 4.542930548649097e-05,
694
+ "loss": 4.4333,
695
+ "step": 2450
696
+ },
697
+ {
698
+ "epoch": 0.5538151711792347,
699
+ "grad_norm": 16.127513885498047,
700
+ "learning_rate": 4.5325708602685234e-05,
701
+ "loss": 4.1792,
702
+ "step": 2475
703
+ },
704
+ {
705
+ "epoch": 0.5594092638174089,
706
+ "grad_norm": 10.214879035949707,
707
+ "learning_rate": 4.5222111718879504e-05,
708
+ "loss": 4.6276,
709
+ "step": 2500
710
+ },
711
+ {
712
+ "epoch": 0.5650033564555829,
713
+ "grad_norm": 10.60392951965332,
714
+ "learning_rate": 4.511851483507376e-05,
715
+ "loss": 4.5925,
716
+ "step": 2525
717
+ },
718
+ {
719
+ "epoch": 0.570597449093757,
720
+ "grad_norm": 11.073285102844238,
721
+ "learning_rate": 4.5014917951268024e-05,
722
+ "loss": 4.4202,
723
+ "step": 2550
724
+ },
725
+ {
726
+ "epoch": 0.5761915417319311,
727
+ "grad_norm": 14.709641456604004,
728
+ "learning_rate": 4.4911321067462294e-05,
729
+ "loss": 4.2758,
730
+ "step": 2575
731
+ },
732
+ {
733
+ "epoch": 0.5817856343701052,
734
+ "grad_norm": 12.901021957397461,
735
+ "learning_rate": 4.480772418365656e-05,
736
+ "loss": 4.3067,
737
+ "step": 2600
738
+ },
739
+ {
740
+ "epoch": 0.5873797270082792,
741
+ "grad_norm": 11.27915096282959,
742
+ "learning_rate": 4.470412729985082e-05,
743
+ "loss": 4.0111,
744
+ "step": 2625
745
+ },
746
+ {
747
+ "epoch": 0.5929738196464533,
748
+ "grad_norm": 15.488706588745117,
749
+ "learning_rate": 4.460053041604509e-05,
750
+ "loss": 4.2671,
751
+ "step": 2650
752
+ },
753
+ {
754
+ "epoch": 0.5985679122846275,
755
+ "grad_norm": 13.603848457336426,
756
+ "learning_rate": 4.4496933532239354e-05,
757
+ "loss": 4.2589,
758
+ "step": 2675
759
+ },
760
+ {
761
+ "epoch": 0.6041620049228015,
762
+ "grad_norm": 12.827610969543457,
763
+ "learning_rate": 4.439333664843362e-05,
764
+ "loss": 4.6476,
765
+ "step": 2700
766
+ },
767
+ {
768
+ "epoch": 0.6097560975609756,
769
+ "grad_norm": 16.806106567382812,
770
+ "learning_rate": 4.428973976462788e-05,
771
+ "loss": 4.6188,
772
+ "step": 2725
773
+ },
774
+ {
775
+ "epoch": 0.6153501901991497,
776
+ "grad_norm": 14.598794937133789,
777
+ "learning_rate": 4.4186142880822144e-05,
778
+ "loss": 4.4195,
779
+ "step": 2750
780
+ },
781
+ {
782
+ "epoch": 0.6209442828373238,
783
+ "grad_norm": 10.380790710449219,
784
+ "learning_rate": 4.4082545997016414e-05,
785
+ "loss": 4.4634,
786
+ "step": 2775
787
+ },
788
+ {
789
+ "epoch": 0.6265383754754978,
790
+ "grad_norm": 12.62149715423584,
791
+ "learning_rate": 4.397894911321068e-05,
792
+ "loss": 4.4751,
793
+ "step": 2800
794
+ },
795
+ {
796
+ "epoch": 0.632132468113672,
797
+ "grad_norm": 10.467231750488281,
798
+ "learning_rate": 4.387535222940494e-05,
799
+ "loss": 4.2898,
800
+ "step": 2825
801
+ },
802
+ {
803
+ "epoch": 0.6377265607518461,
804
+ "grad_norm": 12.244780540466309,
805
+ "learning_rate": 4.377175534559921e-05,
806
+ "loss": 4.3242,
807
+ "step": 2850
808
+ },
809
+ {
810
+ "epoch": 0.6433206533900201,
811
+ "grad_norm": 12.39667797088623,
812
+ "learning_rate": 4.3668158461793474e-05,
813
+ "loss": 4.2754,
814
+ "step": 2875
815
+ },
816
+ {
817
+ "epoch": 0.6489147460281942,
818
+ "grad_norm": 12.747861862182617,
819
+ "learning_rate": 4.356456157798773e-05,
820
+ "loss": 4.286,
821
+ "step": 2900
822
+ },
823
+ {
824
+ "epoch": 0.6545088386663683,
825
+ "grad_norm": 22.809650421142578,
826
+ "learning_rate": 4.3460964694182e-05,
827
+ "loss": 4.3482,
828
+ "step": 2925
829
+ },
830
+ {
831
+ "epoch": 0.6601029313045425,
832
+ "grad_norm": 10.659783363342285,
833
+ "learning_rate": 4.3357367810376264e-05,
834
+ "loss": 4.2066,
835
+ "step": 2950
836
+ },
837
+ {
838
+ "epoch": 0.6656970239427165,
839
+ "grad_norm": 14.72028636932373,
840
+ "learning_rate": 4.325377092657053e-05,
841
+ "loss": 4.4523,
842
+ "step": 2975
843
+ },
844
+ {
845
+ "epoch": 0.6712911165808906,
846
+ "grad_norm": 10.166138648986816,
847
+ "learning_rate": 4.31501740427648e-05,
848
+ "loss": 4.3615,
849
+ "step": 3000
850
+ },
851
+ {
852
+ "epoch": 0.6768852092190647,
853
+ "grad_norm": 12.992719650268555,
854
+ "learning_rate": 4.304657715895906e-05,
855
+ "loss": 4.5061,
856
+ "step": 3025
857
+ },
858
+ {
859
+ "epoch": 0.6824793018572387,
860
+ "grad_norm": 11.16627311706543,
861
+ "learning_rate": 4.2942980275153324e-05,
862
+ "loss": 4.5588,
863
+ "step": 3050
864
+ },
865
+ {
866
+ "epoch": 0.6880733944954128,
867
+ "grad_norm": 10.317902565002441,
868
+ "learning_rate": 4.2839383391347594e-05,
869
+ "loss": 4.3462,
870
+ "step": 3075
871
+ },
872
+ {
873
+ "epoch": 0.693667487133587,
874
+ "grad_norm": 12.662385940551758,
875
+ "learning_rate": 4.273578650754185e-05,
876
+ "loss": 4.5361,
877
+ "step": 3100
878
+ },
879
+ {
880
+ "epoch": 0.6992615797717611,
881
+ "grad_norm": 16.921144485473633,
882
+ "learning_rate": 4.263218962373612e-05,
883
+ "loss": 4.2344,
884
+ "step": 3125
885
+ },
886
+ {
887
+ "epoch": 0.7048556724099351,
888
+ "grad_norm": 17.665006637573242,
889
+ "learning_rate": 4.2528592739930384e-05,
890
+ "loss": 4.2197,
891
+ "step": 3150
892
+ },
893
+ {
894
+ "epoch": 0.7104497650481092,
895
+ "grad_norm": 9.232120513916016,
896
+ "learning_rate": 4.242499585612465e-05,
897
+ "loss": 4.511,
898
+ "step": 3175
899
+ },
900
+ {
901
+ "epoch": 0.7160438576862833,
902
+ "grad_norm": 12.514689445495605,
903
+ "learning_rate": 4.232139897231892e-05,
904
+ "loss": 4.5251,
905
+ "step": 3200
906
+ },
907
+ {
908
+ "epoch": 0.7216379503244573,
909
+ "grad_norm": 11.398234367370605,
910
+ "learning_rate": 4.221780208851318e-05,
911
+ "loss": 4.1084,
912
+ "step": 3225
913
+ },
914
+ {
915
+ "epoch": 0.7272320429626314,
916
+ "grad_norm": 8.08095932006836,
917
+ "learning_rate": 4.2114205204707444e-05,
918
+ "loss": 4.1601,
919
+ "step": 3250
920
+ },
921
+ {
922
+ "epoch": 0.7328261356008056,
923
+ "grad_norm": 12.109641075134277,
924
+ "learning_rate": 4.2010608320901714e-05,
925
+ "loss": 4.274,
926
+ "step": 3275
927
+ },
928
+ {
929
+ "epoch": 0.7384202282389797,
930
+ "grad_norm": 10.819121360778809,
931
+ "learning_rate": 4.190701143709597e-05,
932
+ "loss": 4.5686,
933
+ "step": 3300
934
+ },
935
+ {
936
+ "epoch": 0.7440143208771537,
937
+ "grad_norm": 11.090829849243164,
938
+ "learning_rate": 4.180341455329024e-05,
939
+ "loss": 4.5401,
940
+ "step": 3325
941
+ },
942
+ {
943
+ "epoch": 0.7496084135153278,
944
+ "grad_norm": 11.24759578704834,
945
+ "learning_rate": 4.1699817669484504e-05,
946
+ "loss": 4.4797,
947
+ "step": 3350
948
+ },
949
+ {
950
+ "epoch": 0.7552025061535019,
951
+ "grad_norm": 10.916013717651367,
952
+ "learning_rate": 4.159622078567877e-05,
953
+ "loss": 4.1258,
954
+ "step": 3375
955
+ },
956
+ {
957
+ "epoch": 0.7607965987916759,
958
+ "grad_norm": 11.953822135925293,
959
+ "learning_rate": 4.149262390187304e-05,
960
+ "loss": 4.3313,
961
+ "step": 3400
962
+ },
963
+ {
964
+ "epoch": 0.7663906914298501,
965
+ "grad_norm": 16.665861129760742,
966
+ "learning_rate": 4.13890270180673e-05,
967
+ "loss": 4.2789,
968
+ "step": 3425
969
+ },
970
+ {
971
+ "epoch": 0.7719847840680242,
972
+ "grad_norm": 11.539497375488281,
973
+ "learning_rate": 4.1285430134261564e-05,
974
+ "loss": 4.5436,
975
+ "step": 3450
976
+ },
977
+ {
978
+ "epoch": 0.7775788767061983,
979
+ "grad_norm": 11.955995559692383,
980
+ "learning_rate": 4.118183325045583e-05,
981
+ "loss": 4.1149,
982
+ "step": 3475
983
+ },
984
+ {
985
+ "epoch": 0.7831729693443723,
986
+ "grad_norm": 15.087596893310547,
987
+ "learning_rate": 4.107823636665009e-05,
988
+ "loss": 4.3523,
989
+ "step": 3500
990
+ },
991
+ {
992
+ "epoch": 0.7887670619825464,
993
+ "grad_norm": 14.733497619628906,
994
+ "learning_rate": 4.0974639482844354e-05,
995
+ "loss": 4.2913,
996
+ "step": 3525
997
+ },
998
+ {
999
+ "epoch": 0.7943611546207205,
1000
+ "grad_norm": 10.125676155090332,
1001
+ "learning_rate": 4.0871042599038624e-05,
1002
+ "loss": 4.3078,
1003
+ "step": 3550
1004
+ },
1005
+ {
1006
+ "epoch": 0.7999552472588946,
1007
+ "grad_norm": 11.222993850708008,
1008
+ "learning_rate": 4.076744571523289e-05,
1009
+ "loss": 4.4803,
1010
+ "step": 3575
1011
+ },
1012
+ {
1013
+ "epoch": 0.8055493398970687,
1014
+ "grad_norm": 10.871453285217285,
1015
+ "learning_rate": 4.066384883142715e-05,
1016
+ "loss": 4.5347,
1017
+ "step": 3600
1018
+ },
1019
+ {
1020
+ "epoch": 0.8111434325352428,
1021
+ "grad_norm": 10.571527481079102,
1022
+ "learning_rate": 4.056025194762142e-05,
1023
+ "loss": 4.1658,
1024
+ "step": 3625
1025
+ },
1026
+ {
1027
+ "epoch": 0.8167375251734169,
1028
+ "grad_norm": 12.036689758300781,
1029
+ "learning_rate": 4.0456655063815685e-05,
1030
+ "loss": 4.3284,
1031
+ "step": 3650
1032
+ },
1033
+ {
1034
+ "epoch": 0.8223316178115909,
1035
+ "grad_norm": 13.614919662475586,
1036
+ "learning_rate": 4.035305818000995e-05,
1037
+ "loss": 4.1228,
1038
+ "step": 3675
1039
+ },
1040
+ {
1041
+ "epoch": 0.827925710449765,
1042
+ "grad_norm": 12.302602767944336,
1043
+ "learning_rate": 4.024946129620421e-05,
1044
+ "loss": 4.0459,
1045
+ "step": 3700
1046
+ },
1047
+ {
1048
+ "epoch": 0.8335198030879392,
1049
+ "grad_norm": 13.102405548095703,
1050
+ "learning_rate": 4.0145864412398474e-05,
1051
+ "loss": 4.4641,
1052
+ "step": 3725
1053
+ },
1054
+ {
1055
+ "epoch": 0.8391138957261133,
1056
+ "grad_norm": 12.71800422668457,
1057
+ "learning_rate": 4.0042267528592745e-05,
1058
+ "loss": 4.206,
1059
+ "step": 3750
1060
+ },
1061
+ {
1062
+ "epoch": 0.8447079883642873,
1063
+ "grad_norm": 13.687782287597656,
1064
+ "learning_rate": 3.993867064478701e-05,
1065
+ "loss": 4.388,
1066
+ "step": 3775
1067
+ },
1068
+ {
1069
+ "epoch": 0.8503020810024614,
1070
+ "grad_norm": 16.609664916992188,
1071
+ "learning_rate": 3.983507376098127e-05,
1072
+ "loss": 4.1664,
1073
+ "step": 3800
1074
+ },
1075
+ {
1076
+ "epoch": 0.8558961736406355,
1077
+ "grad_norm": 16.731786727905273,
1078
+ "learning_rate": 3.973147687717554e-05,
1079
+ "loss": 4.3956,
1080
+ "step": 3825
1081
+ },
1082
+ {
1083
+ "epoch": 0.8614902662788095,
1084
+ "grad_norm": 17.152843475341797,
1085
+ "learning_rate": 3.96278799933698e-05,
1086
+ "loss": 4.3338,
1087
+ "step": 3850
1088
+ },
1089
+ {
1090
+ "epoch": 0.8670843589169837,
1091
+ "grad_norm": 14.199774742126465,
1092
+ "learning_rate": 3.952428310956406e-05,
1093
+ "loss": 4.4609,
1094
+ "step": 3875
1095
+ },
1096
+ {
1097
+ "epoch": 0.8726784515551578,
1098
+ "grad_norm": 11.605820655822754,
1099
+ "learning_rate": 3.942068622575833e-05,
1100
+ "loss": 4.1688,
1101
+ "step": 3900
1102
+ },
1103
+ {
1104
+ "epoch": 0.8782725441933319,
1105
+ "grad_norm": 11.608319282531738,
1106
+ "learning_rate": 3.9317089341952595e-05,
1107
+ "loss": 4.2122,
1108
+ "step": 3925
1109
+ },
1110
+ {
1111
+ "epoch": 0.8838666368315059,
1112
+ "grad_norm": 12.212512016296387,
1113
+ "learning_rate": 3.921349245814686e-05,
1114
+ "loss": 4.2482,
1115
+ "step": 3950
1116
+ },
1117
+ {
1118
+ "epoch": 0.88946072946968,
1119
+ "grad_norm": 12.425273895263672,
1120
+ "learning_rate": 3.910989557434113e-05,
1121
+ "loss": 4.5128,
1122
+ "step": 3975
1123
+ },
1124
+ {
1125
+ "epoch": 0.8950548221078541,
1126
+ "grad_norm": 14.292542457580566,
1127
+ "learning_rate": 3.900629869053539e-05,
1128
+ "loss": 4.5421,
1129
+ "step": 4000
1130
+ },
1131
+ {
1132
+ "epoch": 0.9006489147460282,
1133
+ "grad_norm": 9.911199569702148,
1134
+ "learning_rate": 3.8902701806729655e-05,
1135
+ "loss": 4.3237,
1136
+ "step": 4025
1137
+ },
1138
+ {
1139
+ "epoch": 0.9062430073842023,
1140
+ "grad_norm": 6.303875923156738,
1141
+ "learning_rate": 3.879910492292392e-05,
1142
+ "loss": 4.195,
1143
+ "step": 4050
1144
+ },
1145
+ {
1146
+ "epoch": 0.9118371000223764,
1147
+ "grad_norm": 8.433326721191406,
1148
+ "learning_rate": 3.869550803911818e-05,
1149
+ "loss": 4.1758,
1150
+ "step": 4075
1151
+ },
1152
+ {
1153
+ "epoch": 0.9174311926605505,
1154
+ "grad_norm": 12.792257308959961,
1155
+ "learning_rate": 3.859191115531245e-05,
1156
+ "loss": 4.1538,
1157
+ "step": 4100
1158
+ },
1159
+ {
1160
+ "epoch": 0.9230252852987245,
1161
+ "grad_norm": 10.63804817199707,
1162
+ "learning_rate": 3.8488314271506715e-05,
1163
+ "loss": 4.2415,
1164
+ "step": 4125
1165
+ },
1166
+ {
1167
+ "epoch": 0.9286193779368986,
1168
+ "grad_norm": 10.244176864624023,
1169
+ "learning_rate": 3.838471738770098e-05,
1170
+ "loss": 4.624,
1171
+ "step": 4150
1172
+ },
1173
+ {
1174
+ "epoch": 0.9342134705750728,
1175
+ "grad_norm": 14.590502738952637,
1176
+ "learning_rate": 3.828112050389525e-05,
1177
+ "loss": 4.261,
1178
+ "step": 4175
1179
+ },
1180
+ {
1181
+ "epoch": 0.9398075632132468,
1182
+ "grad_norm": 17.149826049804688,
1183
+ "learning_rate": 3.817752362008951e-05,
1184
+ "loss": 4.0573,
1185
+ "step": 4200
1186
+ },
1187
+ {
1188
+ "epoch": 0.9454016558514209,
1189
+ "grad_norm": 10.837606430053711,
1190
+ "learning_rate": 3.8073926736283775e-05,
1191
+ "loss": 4.2502,
1192
+ "step": 4225
1193
+ },
1194
+ {
1195
+ "epoch": 0.950995748489595,
1196
+ "grad_norm": 13.960970878601074,
1197
+ "learning_rate": 3.797032985247804e-05,
1198
+ "loss": 4.1431,
1199
+ "step": 4250
1200
+ },
1201
+ {
1202
+ "epoch": 0.9565898411277691,
1203
+ "grad_norm": 10.603372573852539,
1204
+ "learning_rate": 3.78667329686723e-05,
1205
+ "loss": 4.2805,
1206
+ "step": 4275
1207
+ },
1208
+ {
1209
+ "epoch": 0.9621839337659431,
1210
+ "grad_norm": 14.068360328674316,
1211
+ "learning_rate": 3.776313608486657e-05,
1212
+ "loss": 4.2771,
1213
+ "step": 4300
1214
+ },
1215
+ {
1216
+ "epoch": 0.9677780264041173,
1217
+ "grad_norm": 12.487285614013672,
1218
+ "learning_rate": 3.7659539201060835e-05,
1219
+ "loss": 4.4749,
1220
+ "step": 4325
1221
+ },
1222
+ {
1223
+ "epoch": 0.9733721190422914,
1224
+ "grad_norm": 11.214025497436523,
1225
+ "learning_rate": 3.75559423172551e-05,
1226
+ "loss": 4.1715,
1227
+ "step": 4350
1228
+ },
1229
+ {
1230
+ "epoch": 0.9789662116804654,
1231
+ "grad_norm": 12.117270469665527,
1232
+ "learning_rate": 3.745234543344937e-05,
1233
+ "loss": 4.1669,
1234
+ "step": 4375
1235
+ },
1236
+ {
1237
+ "epoch": 0.9845603043186395,
1238
+ "grad_norm": 7.657718181610107,
1239
+ "learning_rate": 3.734874854964363e-05,
1240
+ "loss": 4.1935,
1241
+ "step": 4400
1242
+ },
1243
+ {
1244
+ "epoch": 0.9901543969568136,
1245
+ "grad_norm": 12.381430625915527,
1246
+ "learning_rate": 3.724929554119012e-05,
1247
+ "loss": 4.3682,
1248
+ "step": 4425
1249
+ },
1250
+ {
1251
+ "epoch": 0.9957484895949877,
1252
+ "grad_norm": 12.749862670898438,
1253
+ "learning_rate": 3.7145698657384385e-05,
1254
+ "loss": 4.2554,
1255
+ "step": 4450
1256
+ },
1257
+ {
1258
+ "epoch": 1.0,
1259
+ "eval_gen_len": 67.1128,
1260
+ "eval_loss": 4.290805816650391,
1261
+ "eval_rouge1": 26.1327,
1262
+ "eval_rouge2": 10.0836,
1263
+ "eval_rougeL": 24.9862,
1264
+ "eval_rougeLsum": 25.321,
1265
+ "eval_runtime": 1004.8822,
1266
+ "eval_samples_per_second": 1.112,
1267
+ "eval_steps_per_second": 0.279,
1268
+ "step": 4469
1269
+ },
1270
+ {
1271
+ "epoch": 1.0013425822331619,
1272
+ "grad_norm": 15.127477645874023,
1273
+ "learning_rate": 3.7042101773578655e-05,
1274
+ "loss": 3.9898,
1275
+ "step": 4475
1276
+ },
1277
+ {
1278
+ "epoch": 1.006936674871336,
1279
+ "grad_norm": 10.354872703552246,
1280
+ "learning_rate": 3.693850488977292e-05,
1281
+ "loss": 3.623,
1282
+ "step": 4500
1283
+ },
1284
+ {
1285
+ "epoch": 1.0125307675095099,
1286
+ "grad_norm": 13.131593704223633,
1287
+ "learning_rate": 3.683490800596718e-05,
1288
+ "loss": 3.7952,
1289
+ "step": 4525
1290
+ },
1291
+ {
1292
+ "epoch": 1.018124860147684,
1293
+ "grad_norm": 10.636340141296387,
1294
+ "learning_rate": 3.673131112216145e-05,
1295
+ "loss": 3.4462,
1296
+ "step": 4550
1297
+ },
1298
+ {
1299
+ "epoch": 1.0237189527858581,
1300
+ "grad_norm": 14.03700065612793,
1301
+ "learning_rate": 3.662771423835571e-05,
1302
+ "loss": 3.4108,
1303
+ "step": 4575
1304
+ },
1305
+ {
1306
+ "epoch": 1.0293130454240322,
1307
+ "grad_norm": 21.94048309326172,
1308
+ "learning_rate": 3.652411735454997e-05,
1309
+ "loss": 3.7664,
1310
+ "step": 4600
1311
+ },
1312
+ {
1313
+ "epoch": 1.0349071380622064,
1314
+ "grad_norm": 11.382323265075684,
1315
+ "learning_rate": 3.642052047074424e-05,
1316
+ "loss": 3.7189,
1317
+ "step": 4625
1318
+ },
1319
+ {
1320
+ "epoch": 1.0405012307003805,
1321
+ "grad_norm": 11.167036056518555,
1322
+ "learning_rate": 3.6316923586938505e-05,
1323
+ "loss": 3.7947,
1324
+ "step": 4650
1325
+ },
1326
+ {
1327
+ "epoch": 1.0460953233385544,
1328
+ "grad_norm": 13.024956703186035,
1329
+ "learning_rate": 3.621332670313277e-05,
1330
+ "loss": 3.6083,
1331
+ "step": 4675
1332
+ },
1333
+ {
1334
+ "epoch": 1.0516894159767285,
1335
+ "grad_norm": 11.757680892944336,
1336
+ "learning_rate": 3.610972981932704e-05,
1337
+ "loss": 3.5998,
1338
+ "step": 4700
1339
+ },
1340
+ {
1341
+ "epoch": 1.0572835086149026,
1342
+ "grad_norm": 14.893111228942871,
1343
+ "learning_rate": 3.60061329355213e-05,
1344
+ "loss": 3.6173,
1345
+ "step": 4725
1346
+ },
1347
+ {
1348
+ "epoch": 1.0628776012530767,
1349
+ "grad_norm": 9.222747802734375,
1350
+ "learning_rate": 3.5902536051715565e-05,
1351
+ "loss": 3.6453,
1352
+ "step": 4750
1353
+ },
1354
+ {
1355
+ "epoch": 1.0684716938912509,
1356
+ "grad_norm": 25.488880157470703,
1357
+ "learning_rate": 3.579893916790983e-05,
1358
+ "loss": 3.3486,
1359
+ "step": 4775
1360
+ },
1361
+ {
1362
+ "epoch": 1.074065786529425,
1363
+ "grad_norm": 10.05694580078125,
1364
+ "learning_rate": 3.569534228410409e-05,
1365
+ "loss": 3.7705,
1366
+ "step": 4800
1367
+ },
1368
+ {
1369
+ "epoch": 1.079659879167599,
1370
+ "grad_norm": 12.402889251708984,
1371
+ "learning_rate": 3.559174540029836e-05,
1372
+ "loss": 3.3739,
1373
+ "step": 4825
1374
+ },
1375
+ {
1376
+ "epoch": 1.085253971805773,
1377
+ "grad_norm": 10.890093803405762,
1378
+ "learning_rate": 3.5488148516492625e-05,
1379
+ "loss": 3.7975,
1380
+ "step": 4850
1381
+ },
1382
+ {
1383
+ "epoch": 1.090848064443947,
1384
+ "grad_norm": 12.410653114318848,
1385
+ "learning_rate": 3.538455163268689e-05,
1386
+ "loss": 3.5483,
1387
+ "step": 4875
1388
+ },
1389
+ {
1390
+ "epoch": 1.0964421570821212,
1391
+ "grad_norm": 11.636606216430664,
1392
+ "learning_rate": 3.528095474888116e-05,
1393
+ "loss": 3.5182,
1394
+ "step": 4900
1395
+ },
1396
+ {
1397
+ "epoch": 1.1020362497202953,
1398
+ "grad_norm": 14.367986679077148,
1399
+ "learning_rate": 3.517735786507542e-05,
1400
+ "loss": 3.9954,
1401
+ "step": 4925
1402
+ },
1403
+ {
1404
+ "epoch": 1.1076303423584695,
1405
+ "grad_norm": 10.753607749938965,
1406
+ "learning_rate": 3.5073760981269685e-05,
1407
+ "loss": 3.8582,
1408
+ "step": 4950
1409
+ },
1410
+ {
1411
+ "epoch": 1.1132244349966436,
1412
+ "grad_norm": 9.407801628112793,
1413
+ "learning_rate": 3.497016409746395e-05,
1414
+ "loss": 3.6181,
1415
+ "step": 4975
1416
+ },
1417
+ {
1418
+ "epoch": 1.1188185276348177,
1419
+ "grad_norm": 9.98642349243164,
1420
+ "learning_rate": 3.486656721365821e-05,
1421
+ "loss": 3.5989,
1422
+ "step": 5000
1423
+ },
1424
+ {
1425
+ "epoch": 1.1244126202729916,
1426
+ "grad_norm": 9.880094528198242,
1427
+ "learning_rate": 3.476297032985248e-05,
1428
+ "loss": 3.658,
1429
+ "step": 5025
1430
+ },
1431
+ {
1432
+ "epoch": 1.1300067129111657,
1433
+ "grad_norm": 14.001792907714844,
1434
+ "learning_rate": 3.4659373446046745e-05,
1435
+ "loss": 3.7073,
1436
+ "step": 5050
1437
+ },
1438
+ {
1439
+ "epoch": 1.1356008055493398,
1440
+ "grad_norm": 18.54832649230957,
1441
+ "learning_rate": 3.455577656224101e-05,
1442
+ "loss": 3.7806,
1443
+ "step": 5075
1444
+ },
1445
+ {
1446
+ "epoch": 1.141194898187514,
1447
+ "grad_norm": 9.804744720458984,
1448
+ "learning_rate": 3.445217967843528e-05,
1449
+ "loss": 3.6671,
1450
+ "step": 5100
1451
+ },
1452
+ {
1453
+ "epoch": 1.146788990825688,
1454
+ "grad_norm": 8.401939392089844,
1455
+ "learning_rate": 3.434858279462954e-05,
1456
+ "loss": 3.8566,
1457
+ "step": 5125
1458
+ },
1459
+ {
1460
+ "epoch": 1.1523830834638622,
1461
+ "grad_norm": 10.120752334594727,
1462
+ "learning_rate": 3.42449859108238e-05,
1463
+ "loss": 3.8143,
1464
+ "step": 5150
1465
+ },
1466
+ {
1467
+ "epoch": 1.1579771761020363,
1468
+ "grad_norm": 16.10240364074707,
1469
+ "learning_rate": 3.414138902701807e-05,
1470
+ "loss": 3.7451,
1471
+ "step": 5175
1472
+ },
1473
+ {
1474
+ "epoch": 1.1635712687402102,
1475
+ "grad_norm": 10.377949714660645,
1476
+ "learning_rate": 3.403779214321233e-05,
1477
+ "loss": 3.6082,
1478
+ "step": 5200
1479
+ },
1480
+ {
1481
+ "epoch": 1.1691653613783843,
1482
+ "grad_norm": 10.826866149902344,
1483
+ "learning_rate": 3.3934195259406595e-05,
1484
+ "loss": 3.8001,
1485
+ "step": 5225
1486
+ },
1487
+ {
1488
+ "epoch": 1.1747594540165585,
1489
+ "grad_norm": 10.02441120147705,
1490
+ "learning_rate": 3.3830598375600866e-05,
1491
+ "loss": 3.6624,
1492
+ "step": 5250
1493
+ },
1494
+ {
1495
+ "epoch": 1.1803535466547326,
1496
+ "grad_norm": 15.683877944946289,
1497
+ "learning_rate": 3.372700149179513e-05,
1498
+ "loss": 3.5834,
1499
+ "step": 5275
1500
+ },
1501
+ {
1502
+ "epoch": 1.1859476392929067,
1503
+ "grad_norm": 11.696283340454102,
1504
+ "learning_rate": 3.362340460798939e-05,
1505
+ "loss": 3.6069,
1506
+ "step": 5300
1507
+ },
1508
+ {
1509
+ "epoch": 1.1915417319310808,
1510
+ "grad_norm": 13.27725601196289,
1511
+ "learning_rate": 3.351980772418366e-05,
1512
+ "loss": 3.6614,
1513
+ "step": 5325
1514
+ },
1515
+ {
1516
+ "epoch": 1.197135824569255,
1517
+ "grad_norm": 11.811793327331543,
1518
+ "learning_rate": 3.341621084037792e-05,
1519
+ "loss": 3.6186,
1520
+ "step": 5350
1521
+ },
1522
+ {
1523
+ "epoch": 1.2027299172074288,
1524
+ "grad_norm": 30.400972366333008,
1525
+ "learning_rate": 3.331261395657219e-05,
1526
+ "loss": 3.6704,
1527
+ "step": 5375
1528
+ },
1529
+ {
1530
+ "epoch": 1.208324009845603,
1531
+ "grad_norm": 11.845870018005371,
1532
+ "learning_rate": 3.320901707276645e-05,
1533
+ "loss": 3.6688,
1534
+ "step": 5400
1535
+ },
1536
+ {
1537
+ "epoch": 1.213918102483777,
1538
+ "grad_norm": 14.447372436523438,
1539
+ "learning_rate": 3.3105420188960716e-05,
1540
+ "loss": 3.8172,
1541
+ "step": 5425
1542
+ },
1543
+ {
1544
+ "epoch": 1.2195121951219512,
1545
+ "grad_norm": 9.492889404296875,
1546
+ "learning_rate": 3.3001823305154986e-05,
1547
+ "loss": 3.5741,
1548
+ "step": 5450
1549
+ },
1550
+ {
1551
+ "epoch": 1.2251062877601253,
1552
+ "grad_norm": 12.105642318725586,
1553
+ "learning_rate": 3.289822642134925e-05,
1554
+ "loss": 3.5402,
1555
+ "step": 5475
1556
+ },
1557
+ {
1558
+ "epoch": 1.2307003803982994,
1559
+ "grad_norm": 7.518635272979736,
1560
+ "learning_rate": 3.279462953754351e-05,
1561
+ "loss": 3.7333,
1562
+ "step": 5500
1563
+ },
1564
+ {
1565
+ "epoch": 1.2362944730364736,
1566
+ "grad_norm": 11.485749244689941,
1567
+ "learning_rate": 3.269103265373778e-05,
1568
+ "loss": 3.8156,
1569
+ "step": 5525
1570
+ },
1571
+ {
1572
+ "epoch": 1.2418885656746474,
1573
+ "grad_norm": 11.726677894592285,
1574
+ "learning_rate": 3.258743576993204e-05,
1575
+ "loss": 3.937,
1576
+ "step": 5550
1577
+ },
1578
+ {
1579
+ "epoch": 1.2474826583128216,
1580
+ "grad_norm": 13.454861640930176,
1581
+ "learning_rate": 3.24838388861263e-05,
1582
+ "loss": 3.5792,
1583
+ "step": 5575
1584
+ },
1585
+ {
1586
+ "epoch": 1.2530767509509957,
1587
+ "grad_norm": 17.696428298950195,
1588
+ "learning_rate": 3.238024200232057e-05,
1589
+ "loss": 3.5828,
1590
+ "step": 5600
1591
+ },
1592
+ {
1593
+ "epoch": 1.2586708435891698,
1594
+ "grad_norm": 12.128670692443848,
1595
+ "learning_rate": 3.2276645118514836e-05,
1596
+ "loss": 3.6379,
1597
+ "step": 5625
1598
+ },
1599
+ {
1600
+ "epoch": 1.264264936227344,
1601
+ "grad_norm": 11.507698059082031,
1602
+ "learning_rate": 3.21730482347091e-05,
1603
+ "loss": 3.5192,
1604
+ "step": 5650
1605
+ },
1606
+ {
1607
+ "epoch": 1.269859028865518,
1608
+ "grad_norm": 11.207321166992188,
1609
+ "learning_rate": 3.206945135090337e-05,
1610
+ "loss": 3.885,
1611
+ "step": 5675
1612
+ },
1613
+ {
1614
+ "epoch": 1.2754531215036922,
1615
+ "grad_norm": 9.954567909240723,
1616
+ "learning_rate": 3.196585446709763e-05,
1617
+ "loss": 3.6138,
1618
+ "step": 5700
1619
+ },
1620
+ {
1621
+ "epoch": 1.281047214141866,
1622
+ "grad_norm": 9.7274751663208,
1623
+ "learning_rate": 3.1862257583291896e-05,
1624
+ "loss": 3.7395,
1625
+ "step": 5725
1626
+ },
1627
+ {
1628
+ "epoch": 1.2866413067800404,
1629
+ "grad_norm": 10.094833374023438,
1630
+ "learning_rate": 3.175866069948616e-05,
1631
+ "loss": 3.6801,
1632
+ "step": 5750
1633
+ },
1634
+ {
1635
+ "epoch": 1.2922353994182143,
1636
+ "grad_norm": 12.403266906738281,
1637
+ "learning_rate": 3.165506381568042e-05,
1638
+ "loss": 3.6891,
1639
+ "step": 5775
1640
+ },
1641
+ {
1642
+ "epoch": 1.2978294920563884,
1643
+ "grad_norm": 13.569632530212402,
1644
+ "learning_rate": 3.155146693187469e-05,
1645
+ "loss": 3.392,
1646
+ "step": 5800
1647
+ },
1648
+ {
1649
+ "epoch": 1.3034235846945625,
1650
+ "grad_norm": 10.21789836883545,
1651
+ "learning_rate": 3.1447870048068956e-05,
1652
+ "loss": 3.4392,
1653
+ "step": 5825
1654
+ },
1655
+ {
1656
+ "epoch": 1.3090176773327367,
1657
+ "grad_norm": 9.875311851501465,
1658
+ "learning_rate": 3.134427316426322e-05,
1659
+ "loss": 3.6576,
1660
+ "step": 5850
1661
+ },
1662
+ {
1663
+ "epoch": 1.3146117699709108,
1664
+ "grad_norm": 13.931588172912598,
1665
+ "learning_rate": 3.124067628045749e-05,
1666
+ "loss": 3.869,
1667
+ "step": 5875
1668
+ },
1669
+ {
1670
+ "epoch": 1.3202058626090847,
1671
+ "grad_norm": 9.532690048217773,
1672
+ "learning_rate": 3.113707939665175e-05,
1673
+ "loss": 3.7711,
1674
+ "step": 5900
1675
+ },
1676
+ {
1677
+ "epoch": 1.325799955247259,
1678
+ "grad_norm": 9.777695655822754,
1679
+ "learning_rate": 3.1033482512846016e-05,
1680
+ "loss": 3.4998,
1681
+ "step": 5925
1682
+ },
1683
+ {
1684
+ "epoch": 1.331394047885433,
1685
+ "grad_norm": 10.633079528808594,
1686
+ "learning_rate": 3.092988562904028e-05,
1687
+ "loss": 3.6,
1688
+ "step": 5950
1689
+ },
1690
+ {
1691
+ "epoch": 1.336988140523607,
1692
+ "grad_norm": 11.918797492980957,
1693
+ "learning_rate": 3.082628874523454e-05,
1694
+ "loss": 3.6084,
1695
+ "step": 5975
1696
+ },
1697
+ {
1698
+ "epoch": 1.3425822331617812,
1699
+ "grad_norm": 11.27762222290039,
1700
+ "learning_rate": 3.072269186142881e-05,
1701
+ "loss": 3.6122,
1702
+ "step": 6000
1703
+ },
1704
+ {
1705
+ "epoch": 1.3481763257999553,
1706
+ "grad_norm": 18.506898880004883,
1707
+ "learning_rate": 3.0619094977623076e-05,
1708
+ "loss": 3.2751,
1709
+ "step": 6025
1710
+ },
1711
+ {
1712
+ "epoch": 1.3537704184381294,
1713
+ "grad_norm": 21.525028228759766,
1714
+ "learning_rate": 3.051549809381734e-05,
1715
+ "loss": 3.4856,
1716
+ "step": 6050
1717
+ },
1718
+ {
1719
+ "epoch": 1.3593645110763033,
1720
+ "grad_norm": 13.029548645019531,
1721
+ "learning_rate": 3.0411901210011606e-05,
1722
+ "loss": 3.5935,
1723
+ "step": 6075
1724
+ },
1725
+ {
1726
+ "epoch": 1.3649586037144776,
1727
+ "grad_norm": 9.536824226379395,
1728
+ "learning_rate": 3.0308304326205873e-05,
1729
+ "loss": 3.569,
1730
+ "step": 6100
1731
+ },
1732
+ {
1733
+ "epoch": 1.3705526963526515,
1734
+ "grad_norm": 11.298256874084473,
1735
+ "learning_rate": 3.0204707442400133e-05,
1736
+ "loss": 3.8319,
1737
+ "step": 6125
1738
+ },
1739
+ {
1740
+ "epoch": 1.3761467889908257,
1741
+ "grad_norm": 8.461627006530762,
1742
+ "learning_rate": 3.0101110558594396e-05,
1743
+ "loss": 3.7151,
1744
+ "step": 6150
1745
+ },
1746
+ {
1747
+ "epoch": 1.3817408816289998,
1748
+ "grad_norm": 8.925524711608887,
1749
+ "learning_rate": 2.9997513674788663e-05,
1750
+ "loss": 3.5296,
1751
+ "step": 6175
1752
+ },
1753
+ {
1754
+ "epoch": 1.387334974267174,
1755
+ "grad_norm": 12.109110832214355,
1756
+ "learning_rate": 2.989391679098293e-05,
1757
+ "loss": 3.4691,
1758
+ "step": 6200
1759
+ },
1760
+ {
1761
+ "epoch": 1.392929066905348,
1762
+ "grad_norm": 9.915245056152344,
1763
+ "learning_rate": 2.9790319907177193e-05,
1764
+ "loss": 3.5228,
1765
+ "step": 6225
1766
+ },
1767
+ {
1768
+ "epoch": 1.398523159543522,
1769
+ "grad_norm": 12.354578018188477,
1770
+ "learning_rate": 2.968672302337146e-05,
1771
+ "loss": 3.5543,
1772
+ "step": 6250
1773
+ },
1774
+ {
1775
+ "epoch": 1.4041172521816963,
1776
+ "grad_norm": 13.05074405670166,
1777
+ "learning_rate": 2.9583126139565726e-05,
1778
+ "loss": 3.5762,
1779
+ "step": 6275
1780
+ },
1781
+ {
1782
+ "epoch": 1.4097113448198701,
1783
+ "grad_norm": 9.809946060180664,
1784
+ "learning_rate": 2.9479529255759986e-05,
1785
+ "loss": 3.4119,
1786
+ "step": 6300
1787
+ },
1788
+ {
1789
+ "epoch": 1.4153054374580443,
1790
+ "grad_norm": 10.61130142211914,
1791
+ "learning_rate": 2.937593237195425e-05,
1792
+ "loss": 3.5493,
1793
+ "step": 6325
1794
+ },
1795
+ {
1796
+ "epoch": 1.4208995300962184,
1797
+ "grad_norm": 8.541769027709961,
1798
+ "learning_rate": 2.9272335488148516e-05,
1799
+ "loss": 3.607,
1800
+ "step": 6350
1801
+ },
1802
+ {
1803
+ "epoch": 1.4264936227343925,
1804
+ "grad_norm": 14.672987937927246,
1805
+ "learning_rate": 2.9168738604342783e-05,
1806
+ "loss": 3.3681,
1807
+ "step": 6375
1808
+ },
1809
+ {
1810
+ "epoch": 1.4320877153725666,
1811
+ "grad_norm": 16.417062759399414,
1812
+ "learning_rate": 2.9065141720537046e-05,
1813
+ "loss": 3.581,
1814
+ "step": 6400
1815
+ },
1816
+ {
1817
+ "epoch": 1.4376818080107405,
1818
+ "grad_norm": 11.234773635864258,
1819
+ "learning_rate": 2.8961544836731313e-05,
1820
+ "loss": 3.8083,
1821
+ "step": 6425
1822
+ },
1823
+ {
1824
+ "epoch": 1.4432759006489149,
1825
+ "grad_norm": 8.443843841552734,
1826
+ "learning_rate": 2.885794795292558e-05,
1827
+ "loss": 3.4413,
1828
+ "step": 6450
1829
+ },
1830
+ {
1831
+ "epoch": 1.4488699932870888,
1832
+ "grad_norm": 11.358126640319824,
1833
+ "learning_rate": 2.8754351069119843e-05,
1834
+ "loss": 3.5337,
1835
+ "step": 6475
1836
+ },
1837
+ {
1838
+ "epoch": 1.4544640859252629,
1839
+ "grad_norm": 15.18321418762207,
1840
+ "learning_rate": 2.8650754185314106e-05,
1841
+ "loss": 3.7784,
1842
+ "step": 6500
1843
+ },
1844
+ {
1845
+ "epoch": 1.460058178563437,
1846
+ "grad_norm": 10.894759178161621,
1847
+ "learning_rate": 2.854715730150837e-05,
1848
+ "loss": 3.736,
1849
+ "step": 6525
1850
+ },
1851
+ {
1852
+ "epoch": 1.4656522712016111,
1853
+ "grad_norm": 8.780854225158691,
1854
+ "learning_rate": 2.8443560417702636e-05,
1855
+ "loss": 3.5959,
1856
+ "step": 6550
1857
+ },
1858
+ {
1859
+ "epoch": 1.4712463638397852,
1860
+ "grad_norm": 16.36128044128418,
1861
+ "learning_rate": 2.8339963533896903e-05,
1862
+ "loss": 3.6979,
1863
+ "step": 6575
1864
+ },
1865
+ {
1866
+ "epoch": 1.4768404564779591,
1867
+ "grad_norm": 10.243796348571777,
1868
+ "learning_rate": 2.8236366650091166e-05,
1869
+ "loss": 3.6813,
1870
+ "step": 6600
1871
+ },
1872
+ {
1873
+ "epoch": 1.4824345491161335,
1874
+ "grad_norm": 13.104743003845215,
1875
+ "learning_rate": 2.8132769766285433e-05,
1876
+ "loss": 3.6853,
1877
+ "step": 6625
1878
+ },
1879
+ {
1880
+ "epoch": 1.4880286417543074,
1881
+ "grad_norm": 11.391397476196289,
1882
+ "learning_rate": 2.80291728824797e-05,
1883
+ "loss": 3.5911,
1884
+ "step": 6650
1885
+ },
1886
+ {
1887
+ "epoch": 1.4936227343924815,
1888
+ "grad_norm": 11.6659574508667,
1889
+ "learning_rate": 2.792557599867396e-05,
1890
+ "loss": 3.5579,
1891
+ "step": 6675
1892
+ },
1893
+ {
1894
+ "epoch": 1.4992168270306556,
1895
+ "grad_norm": 11.695647239685059,
1896
+ "learning_rate": 2.7821979114868223e-05,
1897
+ "loss": 3.5243,
1898
+ "step": 6700
1899
+ },
1900
+ {
1901
+ "epoch": 1.5048109196688297,
1902
+ "grad_norm": 9.701094627380371,
1903
+ "learning_rate": 2.771838223106249e-05,
1904
+ "loss": 3.6699,
1905
+ "step": 6725
1906
+ },
1907
+ {
1908
+ "epoch": 1.5104050123070039,
1909
+ "grad_norm": 15.949247360229492,
1910
+ "learning_rate": 2.7614785347256757e-05,
1911
+ "loss": 3.7613,
1912
+ "step": 6750
1913
+ },
1914
+ {
1915
+ "epoch": 1.5159991049451778,
1916
+ "grad_norm": 13.379142761230469,
1917
+ "learning_rate": 2.751118846345102e-05,
1918
+ "loss": 3.7097,
1919
+ "step": 6775
1920
+ },
1921
+ {
1922
+ "epoch": 1.521593197583352,
1923
+ "grad_norm": 10.693124771118164,
1924
+ "learning_rate": 2.7407591579645287e-05,
1925
+ "loss": 3.7779,
1926
+ "step": 6800
1927
+ },
1928
+ {
1929
+ "epoch": 1.527187290221526,
1930
+ "grad_norm": 7.9651312828063965,
1931
+ "learning_rate": 2.7303994695839553e-05,
1932
+ "loss": 3.6414,
1933
+ "step": 6825
1934
+ },
1935
+ {
1936
+ "epoch": 1.5327813828597001,
1937
+ "grad_norm": 11.157812118530273,
1938
+ "learning_rate": 2.7200397812033817e-05,
1939
+ "loss": 3.6576,
1940
+ "step": 6850
1941
+ },
1942
+ {
1943
+ "epoch": 1.5383754754978742,
1944
+ "grad_norm": 12.323993682861328,
1945
+ "learning_rate": 2.7096800928228077e-05,
1946
+ "loss": 3.4723,
1947
+ "step": 6875
1948
+ },
1949
+ {
1950
+ "epoch": 1.5439695681360484,
1951
+ "grad_norm": 10.302526473999023,
1952
+ "learning_rate": 2.6993204044422343e-05,
1953
+ "loss": 3.7213,
1954
+ "step": 6900
1955
+ },
1956
+ {
1957
+ "epoch": 1.5495636607742225,
1958
+ "grad_norm": 10.622782707214355,
1959
+ "learning_rate": 2.688960716061661e-05,
1960
+ "loss": 3.582,
1961
+ "step": 6925
1962
+ },
1963
+ {
1964
+ "epoch": 1.5551577534123964,
1965
+ "grad_norm": 12.592206001281738,
1966
+ "learning_rate": 2.6786010276810873e-05,
1967
+ "loss": 3.524,
1968
+ "step": 6950
1969
+ },
1970
+ {
1971
+ "epoch": 1.5607518460505707,
1972
+ "grad_norm": 10.893891334533691,
1973
+ "learning_rate": 2.668241339300514e-05,
1974
+ "loss": 3.4272,
1975
+ "step": 6975
1976
+ },
1977
+ {
1978
+ "epoch": 1.5663459386887446,
1979
+ "grad_norm": 11.728677749633789,
1980
+ "learning_rate": 2.6578816509199407e-05,
1981
+ "loss": 3.5618,
1982
+ "step": 7000
1983
+ },
1984
+ {
1985
+ "epoch": 1.5719400313269187,
1986
+ "grad_norm": 14.411259651184082,
1987
+ "learning_rate": 2.647521962539367e-05,
1988
+ "loss": 3.6041,
1989
+ "step": 7025
1990
+ },
1991
+ {
1992
+ "epoch": 1.5775341239650928,
1993
+ "grad_norm": 12.213258743286133,
1994
+ "learning_rate": 2.6371622741587937e-05,
1995
+ "loss": 3.5627,
1996
+ "step": 7050
1997
+ },
1998
+ {
1999
+ "epoch": 1.583128216603267,
2000
+ "grad_norm": 12.341785430908203,
2001
+ "learning_rate": 2.6268025857782197e-05,
2002
+ "loss": 3.631,
2003
+ "step": 7075
2004
+ },
2005
+ {
2006
+ "epoch": 1.588722309241441,
2007
+ "grad_norm": 22.097862243652344,
2008
+ "learning_rate": 2.6164428973976464e-05,
2009
+ "loss": 3.5977,
2010
+ "step": 7100
2011
+ },
2012
+ {
2013
+ "epoch": 1.594316401879615,
2014
+ "grad_norm": 11.355073928833008,
2015
+ "learning_rate": 2.6060832090170727e-05,
2016
+ "loss": 3.6045,
2017
+ "step": 7125
2018
+ },
2019
+ {
2020
+ "epoch": 1.5999104945177893,
2021
+ "grad_norm": 12.087318420410156,
2022
+ "learning_rate": 2.5957235206364994e-05,
2023
+ "loss": 3.5971,
2024
+ "step": 7150
2025
+ },
2026
+ {
2027
+ "epoch": 1.6055045871559632,
2028
+ "grad_norm": 12.888108253479004,
2029
+ "learning_rate": 2.585363832255926e-05,
2030
+ "loss": 3.4293,
2031
+ "step": 7175
2032
+ },
2033
+ {
2034
+ "epoch": 1.6110986797941373,
2035
+ "grad_norm": 11.495614051818848,
2036
+ "learning_rate": 2.5750041438753524e-05,
2037
+ "loss": 3.9471,
2038
+ "step": 7200
2039
+ },
2040
+ {
2041
+ "epoch": 1.6166927724323115,
2042
+ "grad_norm": 12.585895538330078,
2043
+ "learning_rate": 2.564644455494779e-05,
2044
+ "loss": 3.5671,
2045
+ "step": 7225
2046
+ },
2047
+ {
2048
+ "epoch": 1.6222868650704856,
2049
+ "grad_norm": 8.79129409790039,
2050
+ "learning_rate": 2.554284767114205e-05,
2051
+ "loss": 3.6453,
2052
+ "step": 7250
2053
+ },
2054
+ {
2055
+ "epoch": 1.6278809577086597,
2056
+ "grad_norm": 10.552870750427246,
2057
+ "learning_rate": 2.5439250787336317e-05,
2058
+ "loss": 3.5649,
2059
+ "step": 7275
2060
+ },
2061
+ {
2062
+ "epoch": 1.6334750503468336,
2063
+ "grad_norm": 15.649266242980957,
2064
+ "learning_rate": 2.533565390353058e-05,
2065
+ "loss": 3.5291,
2066
+ "step": 7300
2067
+ },
2068
+ {
2069
+ "epoch": 1.639069142985008,
2070
+ "grad_norm": 13.944864273071289,
2071
+ "learning_rate": 2.5232057019724847e-05,
2072
+ "loss": 3.3558,
2073
+ "step": 7325
2074
+ },
2075
+ {
2076
+ "epoch": 1.6446632356231818,
2077
+ "grad_norm": 14.092317581176758,
2078
+ "learning_rate": 2.5128460135919114e-05,
2079
+ "loss": 3.6211,
2080
+ "step": 7350
2081
+ },
2082
+ {
2083
+ "epoch": 1.650257328261356,
2084
+ "grad_norm": 12.18454647064209,
2085
+ "learning_rate": 2.5024863252113377e-05,
2086
+ "loss": 3.7732,
2087
+ "step": 7375
2088
+ },
2089
+ {
2090
+ "epoch": 1.65585142089953,
2091
+ "grad_norm": 10.779006958007812,
2092
+ "learning_rate": 2.492126636830764e-05,
2093
+ "loss": 3.6804,
2094
+ "step": 7400
2095
+ },
2096
+ {
2097
+ "epoch": 1.6614455135377042,
2098
+ "grad_norm": 10.292470932006836,
2099
+ "learning_rate": 2.4817669484501907e-05,
2100
+ "loss": 3.5316,
2101
+ "step": 7425
2102
+ },
2103
+ {
2104
+ "epoch": 1.6670396061758783,
2105
+ "grad_norm": 13.621623039245605,
2106
+ "learning_rate": 2.4714072600696174e-05,
2107
+ "loss": 3.4568,
2108
+ "step": 7450
2109
+ },
2110
+ {
2111
+ "epoch": 1.6726336988140522,
2112
+ "grad_norm": 11.503890991210938,
2113
+ "learning_rate": 2.4610475716890437e-05,
2114
+ "loss": 3.3641,
2115
+ "step": 7475
2116
+ },
2117
+ {
2118
+ "epoch": 1.6782277914522266,
2119
+ "grad_norm": 13.564423561096191,
2120
+ "learning_rate": 2.45068788330847e-05,
2121
+ "loss": 3.5669,
2122
+ "step": 7500
2123
+ },
2124
+ {
2125
+ "epoch": 1.6838218840904005,
2126
+ "grad_norm": 9.360475540161133,
2127
+ "learning_rate": 2.4403281949278967e-05,
2128
+ "loss": 3.6688,
2129
+ "step": 7525
2130
+ },
2131
+ {
2132
+ "epoch": 1.6894159767285746,
2133
+ "grad_norm": 13.366150856018066,
2134
+ "learning_rate": 2.4299685065473234e-05,
2135
+ "loss": 3.3199,
2136
+ "step": 7550
2137
+ },
2138
+ {
2139
+ "epoch": 1.6950100693667487,
2140
+ "grad_norm": 10.094856262207031,
2141
+ "learning_rate": 2.4196088181667497e-05,
2142
+ "loss": 3.4633,
2143
+ "step": 7575
2144
+ },
2145
+ {
2146
+ "epoch": 1.7006041620049228,
2147
+ "grad_norm": 9.134540557861328,
2148
+ "learning_rate": 2.409249129786176e-05,
2149
+ "loss": 3.6078,
2150
+ "step": 7600
2151
+ },
2152
+ {
2153
+ "epoch": 1.706198254643097,
2154
+ "grad_norm": 11.252095222473145,
2155
+ "learning_rate": 2.3988894414056027e-05,
2156
+ "loss": 3.7198,
2157
+ "step": 7625
2158
+ },
2159
+ {
2160
+ "epoch": 1.7117923472812708,
2161
+ "grad_norm": 6.201746940612793,
2162
+ "learning_rate": 2.388529753025029e-05,
2163
+ "loss": 3.5264,
2164
+ "step": 7650
2165
+ },
2166
+ {
2167
+ "epoch": 1.7173864399194452,
2168
+ "grad_norm": 12.331997871398926,
2169
+ "learning_rate": 2.3781700646444557e-05,
2170
+ "loss": 3.4556,
2171
+ "step": 7675
2172
+ },
2173
+ {
2174
+ "epoch": 1.722980532557619,
2175
+ "grad_norm": 12.859444618225098,
2176
+ "learning_rate": 2.367810376263882e-05,
2177
+ "loss": 3.4533,
2178
+ "step": 7700
2179
+ },
2180
+ {
2181
+ "epoch": 1.7285746251957932,
2182
+ "grad_norm": 13.243956565856934,
2183
+ "learning_rate": 2.3574506878833087e-05,
2184
+ "loss": 3.7355,
2185
+ "step": 7725
2186
+ },
2187
+ {
2188
+ "epoch": 1.7341687178339673,
2189
+ "grad_norm": 11.75436019897461,
2190
+ "learning_rate": 2.347090999502735e-05,
2191
+ "loss": 3.6108,
2192
+ "step": 7750
2193
+ },
2194
+ {
2195
+ "epoch": 1.7397628104721414,
2196
+ "grad_norm": 13.429585456848145,
2197
+ "learning_rate": 2.3367313111221614e-05,
2198
+ "loss": 3.7925,
2199
+ "step": 7775
2200
+ },
2201
+ {
2202
+ "epoch": 1.7453569031103155,
2203
+ "grad_norm": 11.077943801879883,
2204
+ "learning_rate": 2.326371622741588e-05,
2205
+ "loss": 3.6788,
2206
+ "step": 7800
2207
+ },
2208
+ {
2209
+ "epoch": 1.7509509957484894,
2210
+ "grad_norm": 7.700258731842041,
2211
+ "learning_rate": 2.3160119343610144e-05,
2212
+ "loss": 3.4562,
2213
+ "step": 7825
2214
+ },
2215
+ {
2216
+ "epoch": 1.7565450883866638,
2217
+ "grad_norm": 11.34974479675293,
2218
+ "learning_rate": 2.305652245980441e-05,
2219
+ "loss": 3.3855,
2220
+ "step": 7850
2221
+ },
2222
+ {
2223
+ "epoch": 1.7621391810248377,
2224
+ "grad_norm": 11.598840713500977,
2225
+ "learning_rate": 2.2952925575998674e-05,
2226
+ "loss": 3.6097,
2227
+ "step": 7875
2228
+ },
2229
+ {
2230
+ "epoch": 1.7677332736630118,
2231
+ "grad_norm": 11.69258975982666,
2232
+ "learning_rate": 2.284932869219294e-05,
2233
+ "loss": 3.6146,
2234
+ "step": 7900
2235
+ },
2236
+ {
2237
+ "epoch": 1.773327366301186,
2238
+ "grad_norm": 10.501328468322754,
2239
+ "learning_rate": 2.2745731808387204e-05,
2240
+ "loss": 3.6597,
2241
+ "step": 7925
2242
+ },
2243
+ {
2244
+ "epoch": 1.77892145893936,
2245
+ "grad_norm": 12.03715705871582,
2246
+ "learning_rate": 2.264213492458147e-05,
2247
+ "loss": 3.6975,
2248
+ "step": 7950
2249
+ },
2250
+ {
2251
+ "epoch": 1.7845155515775342,
2252
+ "grad_norm": 13.386404991149902,
2253
+ "learning_rate": 2.2538538040775734e-05,
2254
+ "loss": 3.6164,
2255
+ "step": 7975
2256
+ },
2257
+ {
2258
+ "epoch": 1.790109644215708,
2259
+ "grad_norm": 9.877335548400879,
2260
+ "learning_rate": 2.2434941156969997e-05,
2261
+ "loss": 3.614,
2262
+ "step": 8000
2263
+ },
2264
+ {
2265
+ "epoch": 1.7957037368538824,
2266
+ "grad_norm": 12.025654792785645,
2267
+ "learning_rate": 2.2335488148516494e-05,
2268
+ "loss": 3.743,
2269
+ "step": 8025
2270
+ },
2271
+ {
2272
+ "epoch": 1.8012978294920563,
2273
+ "grad_norm": 12.508997917175293,
2274
+ "learning_rate": 2.2231891264710757e-05,
2275
+ "loss": 3.7309,
2276
+ "step": 8050
2277
+ },
2278
+ {
2279
+ "epoch": 1.8068919221302304,
2280
+ "grad_norm": 12.994415283203125,
2281
+ "learning_rate": 2.2128294380905024e-05,
2282
+ "loss": 3.5284,
2283
+ "step": 8075
2284
+ },
2285
+ {
2286
+ "epoch": 1.8124860147684045,
2287
+ "grad_norm": 12.859843254089355,
2288
+ "learning_rate": 2.202469749709929e-05,
2289
+ "loss": 3.686,
2290
+ "step": 8100
2291
+ },
2292
+ {
2293
+ "epoch": 1.8180801074065787,
2294
+ "grad_norm": 15.91470718383789,
2295
+ "learning_rate": 2.192110061329355e-05,
2296
+ "loss": 3.3395,
2297
+ "step": 8125
2298
+ },
2299
+ {
2300
+ "epoch": 1.8236742000447528,
2301
+ "grad_norm": 10.755178451538086,
2302
+ "learning_rate": 2.1817503729487817e-05,
2303
+ "loss": 3.6407,
2304
+ "step": 8150
2305
+ },
2306
+ {
2307
+ "epoch": 1.8292682926829267,
2308
+ "grad_norm": 10.679194450378418,
2309
+ "learning_rate": 2.1713906845682084e-05,
2310
+ "loss": 3.5509,
2311
+ "step": 8175
2312
+ },
2313
+ {
2314
+ "epoch": 1.834862385321101,
2315
+ "grad_norm": 18.6633243560791,
2316
+ "learning_rate": 2.1610309961876348e-05,
2317
+ "loss": 3.5166,
2318
+ "step": 8200
2319
+ },
2320
+ {
2321
+ "epoch": 1.840456477959275,
2322
+ "grad_norm": 7.9321112632751465,
2323
+ "learning_rate": 2.150671307807061e-05,
2324
+ "loss": 3.36,
2325
+ "step": 8225
2326
+ },
2327
+ {
2328
+ "epoch": 1.8460505705974493,
2329
+ "grad_norm": 10.757131576538086,
2330
+ "learning_rate": 2.1403116194264878e-05,
2331
+ "loss": 3.4811,
2332
+ "step": 8250
2333
+ },
2334
+ {
2335
+ "epoch": 1.8516446632356232,
2336
+ "grad_norm": 15.632428169250488,
2337
+ "learning_rate": 2.1299519310459144e-05,
2338
+ "loss": 3.5502,
2339
+ "step": 8275
2340
+ },
2341
+ {
2342
+ "epoch": 1.8572387558737973,
2343
+ "grad_norm": 19.17276954650879,
2344
+ "learning_rate": 2.1195922426653408e-05,
2345
+ "loss": 3.5601,
2346
+ "step": 8300
2347
+ },
2348
+ {
2349
+ "epoch": 1.8628328485119714,
2350
+ "grad_norm": 11.047025680541992,
2351
+ "learning_rate": 2.109232554284767e-05,
2352
+ "loss": 3.6373,
2353
+ "step": 8325
2354
+ },
2355
+ {
2356
+ "epoch": 1.8684269411501453,
2357
+ "grad_norm": 15.699575424194336,
2358
+ "learning_rate": 2.0988728659041938e-05,
2359
+ "loss": 3.7378,
2360
+ "step": 8350
2361
+ },
2362
+ {
2363
+ "epoch": 1.8740210337883196,
2364
+ "grad_norm": 13.09723949432373,
2365
+ "learning_rate": 2.08851317752362e-05,
2366
+ "loss": 3.8717,
2367
+ "step": 8375
2368
+ },
2369
+ {
2370
+ "epoch": 1.8796151264264935,
2371
+ "grad_norm": 8.441289901733398,
2372
+ "learning_rate": 2.0781534891430468e-05,
2373
+ "loss": 3.2558,
2374
+ "step": 8400
2375
+ },
2376
+ {
2377
+ "epoch": 1.8852092190646679,
2378
+ "grad_norm": 12.046778678894043,
2379
+ "learning_rate": 2.067793800762473e-05,
2380
+ "loss": 3.3936,
2381
+ "step": 8425
2382
+ },
2383
+ {
2384
+ "epoch": 1.8908033117028418,
2385
+ "grad_norm": 10.983031272888184,
2386
+ "learning_rate": 2.0574341123818998e-05,
2387
+ "loss": 3.4321,
2388
+ "step": 8450
2389
+ },
2390
+ {
2391
+ "epoch": 1.896397404341016,
2392
+ "grad_norm": 12.274590492248535,
2393
+ "learning_rate": 2.047074424001326e-05,
2394
+ "loss": 3.4638,
2395
+ "step": 8475
2396
+ },
2397
+ {
2398
+ "epoch": 1.90199149697919,
2399
+ "grad_norm": 15.135939598083496,
2400
+ "learning_rate": 2.0367147356207524e-05,
2401
+ "loss": 3.5686,
2402
+ "step": 8500
2403
+ },
2404
+ {
2405
+ "epoch": 1.907585589617364,
2406
+ "grad_norm": 11.194721221923828,
2407
+ "learning_rate": 2.026355047240179e-05,
2408
+ "loss": 3.7067,
2409
+ "step": 8525
2410
+ },
2411
+ {
2412
+ "epoch": 1.9131796822555382,
2413
+ "grad_norm": 15.062312126159668,
2414
+ "learning_rate": 2.0159953588596054e-05,
2415
+ "loss": 3.5299,
2416
+ "step": 8550
2417
+ },
2418
+ {
2419
+ "epoch": 1.9187737748937121,
2420
+ "grad_norm": 12.282342910766602,
2421
+ "learning_rate": 2.005635670479032e-05,
2422
+ "loss": 3.5803,
2423
+ "step": 8575
2424
+ },
2425
+ {
2426
+ "epoch": 1.9243678675318865,
2427
+ "grad_norm": 14.33022689819336,
2428
+ "learning_rate": 1.9952759820984584e-05,
2429
+ "loss": 3.5835,
2430
+ "step": 8600
2431
+ },
2432
+ {
2433
+ "epoch": 1.9299619601700604,
2434
+ "grad_norm": 8.249588966369629,
2435
+ "learning_rate": 1.984916293717885e-05,
2436
+ "loss": 3.2694,
2437
+ "step": 8625
2438
+ },
2439
+ {
2440
+ "epoch": 1.9355560528082345,
2441
+ "grad_norm": 9.1649169921875,
2442
+ "learning_rate": 1.9745566053373115e-05,
2443
+ "loss": 3.6105,
2444
+ "step": 8650
2445
+ },
2446
+ {
2447
+ "epoch": 1.9411501454464086,
2448
+ "grad_norm": 8.755537986755371,
2449
+ "learning_rate": 1.964196916956738e-05,
2450
+ "loss": 3.5554,
2451
+ "step": 8675
2452
+ },
2453
+ {
2454
+ "epoch": 1.9467442380845825,
2455
+ "grad_norm": 8.148399353027344,
2456
+ "learning_rate": 1.9538372285761645e-05,
2457
+ "loss": 3.5073,
2458
+ "step": 8700
2459
+ },
2460
+ {
2461
+ "epoch": 1.9523383307227569,
2462
+ "grad_norm": 10.864067077636719,
2463
+ "learning_rate": 1.943477540195591e-05,
2464
+ "loss": 3.6951,
2465
+ "step": 8725
2466
+ },
2467
+ {
2468
+ "epoch": 1.9579324233609308,
2469
+ "grad_norm": 13.049738883972168,
2470
+ "learning_rate": 1.9331178518150175e-05,
2471
+ "loss": 3.5186,
2472
+ "step": 8750
2473
+ },
2474
+ {
2475
+ "epoch": 1.963526515999105,
2476
+ "grad_norm": 5.955536842346191,
2477
+ "learning_rate": 1.922758163434444e-05,
2478
+ "loss": 3.479,
2479
+ "step": 8775
2480
+ },
2481
+ {
2482
+ "epoch": 1.969120608637279,
2483
+ "grad_norm": 12.35295581817627,
2484
+ "learning_rate": 1.9123984750538705e-05,
2485
+ "loss": 3.5103,
2486
+ "step": 8800
2487
+ },
2488
+ {
2489
+ "epoch": 1.9747147012754531,
2490
+ "grad_norm": 9.945602416992188,
2491
+ "learning_rate": 1.9020387866732968e-05,
2492
+ "loss": 3.6919,
2493
+ "step": 8825
2494
+ },
2495
+ {
2496
+ "epoch": 1.9803087939136272,
2497
+ "grad_norm": 9.716672897338867,
2498
+ "learning_rate": 1.8916790982927235e-05,
2499
+ "loss": 3.7097,
2500
+ "step": 8850
2501
+ },
2502
+ {
2503
+ "epoch": 1.9859028865518011,
2504
+ "grad_norm": 7.3651041984558105,
2505
+ "learning_rate": 1.88131940991215e-05,
2506
+ "loss": 3.4111,
2507
+ "step": 8875
2508
+ },
2509
+ {
2510
+ "epoch": 1.9914969791899755,
2511
+ "grad_norm": 11.258004188537598,
2512
+ "learning_rate": 1.8709597215315765e-05,
2513
+ "loss": 3.4201,
2514
+ "step": 8900
2515
+ },
2516
+ {
2517
+ "epoch": 1.9970910718281494,
2518
+ "grad_norm": 14.962812423706055,
2519
+ "learning_rate": 1.8606000331510028e-05,
2520
+ "loss": 3.4142,
2521
+ "step": 8925
2522
+ },
2523
+ {
2524
+ "epoch": 2.0,
2525
+ "eval_gen_len": 61.8854,
2526
+ "eval_loss": 4.248934268951416,
2527
+ "eval_rouge1": 25.7685,
2528
+ "eval_rouge2": 9.8226,
2529
+ "eval_rougeL": 24.6426,
2530
+ "eval_rougeLsum": 24.9756,
2531
+ "eval_runtime": 700.259,
2532
+ "eval_samples_per_second": 1.595,
2533
+ "eval_steps_per_second": 0.4,
2534
+ "step": 8938
2535
+ }
2536
+ ],
2537
+ "logging_steps": 25,
2538
+ "max_steps": 13407,
2539
+ "num_input_tokens_seen": 0,
2540
+ "num_train_epochs": 3,
2541
+ "save_steps": 500,
2542
+ "stateful_callbacks": {
2543
+ "EarlyStoppingCallback": {
2544
+ "args": {
2545
+ "early_stopping_patience": 5,
2546
+ "early_stopping_threshold": 0.01
2547
+ },
2548
+ "attributes": {
2549
+ "early_stopping_patience_counter": 0
2550
+ }
2551
+ },
2552
+ "TrainerControl": {
2553
+ "args": {
2554
+ "should_epoch_stop": false,
2555
+ "should_evaluate": false,
2556
+ "should_log": false,
2557
+ "should_save": true,
2558
+ "should_training_stop": false
2559
+ },
2560
+ "attributes": {}
2561
+ }
2562
+ },
2563
+ "total_flos": 1151792816578560.0,
2564
+ "train_batch_size": 2,
2565
+ "trial_name": null,
2566
+ "trial_params": null
2567
+ }
checkpoint-8938/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01e34e737c610c7cc15ed5a164693fecd2333673e9691a688c4593b566063c81
3
+ size 5304
checkpoint-8938/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/bart-large-cnn",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "gelu",
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 12,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 16,
23
+ "encoder_ffn_dim": 4096,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 12,
26
+ "eos_token_id": 2,
27
+ "force_bos_token_to_be_generated": true,
28
+ "forced_bos_token_id": 0,
29
+ "forced_eos_token_id": 2,
30
+ "gradient_checkpointing": false,
31
+ "id2label": {
32
+ "0": "LABEL_0",
33
+ "1": "LABEL_1",
34
+ "2": "LABEL_2"
35
+ },
36
+ "init_std": 0.02,
37
+ "is_encoder_decoder": true,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1,
41
+ "LABEL_2": 2
42
+ },
43
+ "length_penalty": 2.0,
44
+ "max_length": 142,
45
+ "max_position_embeddings": 1024,
46
+ "min_length": 56,
47
+ "model_type": "bart",
48
+ "no_repeat_ngram_size": 3,
49
+ "normalize_before": false,
50
+ "num_beams": 4,
51
+ "num_hidden_layers": 12,
52
+ "output_past": true,
53
+ "pad_token_id": 1,
54
+ "prefix": " ",
55
+ "scale_embedding": false,
56
+ "task_specific_params": {
57
+ "summarization": {
58
+ "early_stopping": true,
59
+ "length_penalty": 2.0,
60
+ "max_length": 142,
61
+ "min_length": 56,
62
+ "no_repeat_ngram_size": 3,
63
+ "num_beams": 4
64
+ }
65
+ },
66
+ "torch_dtype": "float32",
67
+ "transformers_version": "4.41.0",
68
+ "use_cache": true,
69
+ "vocab_size": 50265
70
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 0,
3
+ "decoder_start_token_id": 2,
4
+ "early_stopping": true,
5
+ "eos_token_id": 2,
6
+ "forced_bos_token_id": 0,
7
+ "forced_eos_token_id": 2,
8
+ "length_penalty": 2.0,
9
+ "max_length": 142,
10
+ "min_length": 56,
11
+ "no_repeat_ngram_size": 3,
12
+ "num_beams": 4,
13
+ "pad_token_id": 1,
14
+ "transformers_version": "4.41.0",
15
+ "use_cache": false
16
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddd6fefb4de9b6731b7efcc997fdb1cc4db285111506952feaefad76b0cf7029
3
+ size 1625426996
runs/Jun11_09-59-45_r-riken01-t5-base-njeyrhxb-70853-v6p9n/events.out.tfevents.1718099995.r-riken01-t5-base-njeyrhxb-70853-v6p9n.89.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22230db46118482acbeb9b3c3ed82309c92a46ecf320c1253b3dfff9d206673a
3
- size 116102
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1975b3f11670ffece3daafd9757177cb3a1e42fd643692d800cb91443b8c7d44
3
+ size 120990
runs/Jun11_09-59-45_r-riken01-t5-base-njeyrhxb-70853-v6p9n/events.out.tfevents.1718107440.r-riken01-t5-base-njeyrhxb-70853-v6p9n.89.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16a4b7dc3ee69997a329da242f480dc47666bf51ef5b91042c0df44fd10f6fdc
3
+ size 613
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "BartTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01e34e737c610c7cc15ed5a164693fecd2333673e9691a688c4593b566063c81
3
+ size 5304
training_params.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data_path": "autotrain-l6hey-orl0t/autotrain-data",
3
+ "model": "facebook/bart-large-cnn",
4
+ "username": "riken01",
5
+ "seed": 42,
6
+ "train_split": "train",
7
+ "valid_split": "validation",
8
+ "project_name": "autotrain-l6hey-orl0t",
9
+ "push_to_hub": true,
10
+ "text_column": "autotrain_text",
11
+ "target_column": "autotrain_label",
12
+ "lr": 5e-05,
13
+ "epochs": 3,
14
+ "max_seq_length": 128,
15
+ "max_target_length": 128,
16
+ "batch_size": 2,
17
+ "warmup_ratio": 0.1,
18
+ "gradient_accumulation": 1,
19
+ "optimizer": "adamw_torch",
20
+ "scheduler": "linear",
21
+ "weight_decay": 0.0,
22
+ "max_grad_norm": 1.0,
23
+ "logging_steps": -1,
24
+ "evaluation_strategy": "epoch",
25
+ "auto_find_batch_size": false,
26
+ "mixed_precision": "fp16",
27
+ "save_total_limit": 1,
28
+ "peft": false,
29
+ "quantization": "int4",
30
+ "lora_r": 16,
31
+ "lora_alpha": 32,
32
+ "lora_dropout": 0.05,
33
+ "target_modules": "all-linear",
34
+ "log": "tensorboard",
35
+ "early_stopping_patience": 5,
36
+ "early_stopping_threshold": 0.01
37
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff