Lennart Keller committed on
Commit
46fe6f2
1 Parent(s): 3fdc44e

initial commit

Browse files
README.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - generated_from_trainer
4
+ model-index:
5
+ - name: first
6
+ results: []
7
+ ---
8
+
9
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
10
+ should probably proofread and complete it, then remove this comment. -->
11
+
12
+ # first
13
+
14
+ This model is a fine-tuned version of [nystromformer-gottbert-base-8192](https://huggingface.co/nystromformer-gottbert-base-8192) on an unspecified dataset.
15
+ It achieves the following results on the evaluation set:
16
+ - Loss: 1.5135
17
+
18
+ ## Model description
19
+
20
+ More information needed
21
+
22
+ ## Intended uses & limitations
23
+
24
+ More information needed
25
+
26
+ ## Training and evaluation data
27
+
28
+ More information needed
29
+
30
+ ## Training procedure
31
+
32
+ ### Training hyperparameters
33
+
34
+ The following hyperparameters were used during training:
35
+ - learning_rate: 3e-05
36
+ - train_batch_size: 2
37
+ - eval_batch_size: 4
38
+ - seed: 42
39
+ - gradient_accumulation_steps: 8
40
+ - total_train_batch_size: 16
41
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
+ - lr_scheduler_type: linear
43
+ - lr_scheduler_warmup_steps: 500
44
+ - num_epochs: 3.0
45
+ - mixed_precision_training: Native AMP
46
+
47
+ ### Training results
48
+
49
+ | Training Loss | Epoch | Step | Validation Loss |
50
+ |:-------------:|:-----:|:-----:|:---------------:|
51
+ | 6.7133 | 0.1 | 500 | 6.6155 |
52
+ | 2.7876 | 0.2 | 1000 | 2.5542 |
53
+ | 2.1831 | 0.3 | 1500 | 2.0356 |
54
+ | 2.0316 | 0.4 | 2000 | 1.8793 |
55
+ | 2.0678 | 0.49 | 2500 | 1.7954 |
56
+ | 1.8182 | 0.59 | 3000 | 1.7473 |
57
+ | 1.7393 | 0.69 | 3500 | 1.7081 |
58
+ | 1.7586 | 0.79 | 4000 | 1.6787 |
59
+ | 1.7417 | 0.89 | 4500 | 1.6563 |
60
+ | 1.8256 | 0.99 | 5000 | 1.6370 |
61
+ | 1.7957 | 1.09 | 5500 | 1.6219 |
62
+ | 1.6876 | 1.19 | 6000 | 1.6084 |
63
+ | 1.7172 | 1.28 | 6500 | 1.5941 |
64
+ | 1.6564 | 1.38 | 7000 | 1.5881 |
65
+ | 1.732 | 1.48 | 7500 | 1.5757 |
66
+ | 1.8272 | 1.58 | 8000 | 1.5692 |
67
+ | 1.7951 | 1.68 | 8500 | 1.5617 |
68
+ | 1.6669 | 1.78 | 9000 | 1.5546 |
69
+ | 1.6489 | 1.88 | 9500 | 1.5458 |
70
+ | 1.772 | 1.98 | 10000 | 1.5439 |
71
+ | 1.7424 | 2.08 | 10500 | 1.5379 |
72
+ | 1.7077 | 2.17 | 11000 | 1.5322 |
73
+ | 1.6926 | 2.27 | 11500 | 1.5294 |
74
+ | 1.656 | 2.37 | 12000 | 1.5274 |
75
+ | 1.7002 | 2.47 | 12500 | 1.5201 |
76
+ | 1.7102 | 2.57 | 13000 | 1.5197 |
77
+ | 1.7158 | 2.67 | 13500 | 1.5162 |
78
+ | 1.6081 | 2.77 | 14000 | 1.5169 |
79
+ | 1.754 | 2.87 | 14500 | 1.5140 |
80
+ | 1.3588 | 2.96 | 15000 | 1.5135 |
81
+
82
+
83
+ ### Framework versions
84
+
85
+ - Transformers 4.16.2
86
+ - Pytorch 1.10.1+cu113
87
+ - Datasets 1.18.3
88
+ - Tokenizers 0.11.0
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_loss": 1.5135135650634766,
4
+ "eval_runtime": 2606.4957,
5
+ "eval_samples": 4150,
6
+ "eval_samples_per_second": 1.592,
7
+ "eval_steps_per_second": 0.398,
8
+ "perplexity": 4.542663731799982,
9
+ "train_loss": 1.0157656305313738,
10
+ "train_runtime": 373667.9433,
11
+ "train_samples": 80971,
12
+ "train_samples_per_second": 0.65,
13
+ "train_steps_per_second": 0.041
14
+ }
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nystromformer-gottbert-base-8192",
3
+ "architectures": [
4
+ "NystromformerForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "conv_kernel_size": 65,
10
+ "eos_token_id": 2,
11
+ "gradient_checkpointing": false,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
14
+ "hidden_size": 768,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 3072,
17
+ "inv_coeff_init_option": false,
18
+ "layer_norm_eps": 1e-05,
19
+ "max_position_embeddings": 8192,
20
+ "model_type": "nystromformer",
21
+ "num_attention_heads": 12,
22
+ "num_hidden_layers": 12,
23
+ "num_landmarks": 64,
24
+ "pad_token_id": 1,
25
+ "position_embedding_type": "absolute",
26
+ "segment_means_seq_len": 64,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.16.2",
29
+ "type_vocab_size": 1,
30
+ "use_cache": true,
31
+ "vocab_size": 52009
32
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_loss": 1.5135135650634766,
4
+ "eval_runtime": 2606.4957,
5
+ "eval_samples": 4150,
6
+ "eval_samples_per_second": 1.592,
7
+ "eval_steps_per_second": 0.398,
8
+ "perplexity": 4.542663731799982
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d963bab49b254e3b8e627950d6e9fa39d1d29cf71f64b481300ed1c92bda5f05
3
+ size 527939347
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "trim_offsets": true, "special_tokens_map_file": null, "name_or_path": "nystromformer-gottbert-base-8192", "tokenizer_class": "RobertaTokenizer"}
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 1.0157656305313738,
4
+ "train_runtime": 373667.9433,
5
+ "train_samples": 80971,
6
+ "train_samples_per_second": 0.65,
7
+ "train_steps_per_second": 0.041
8
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ace402ae3c56a39dc2924c4c0dd5253cb9688648e32b7ac2c64d8aea4c5eedc4
3
+ size 3119
vocab.json ADDED
The diff for this file is too large to render. See raw diff