smiled0g committed on
Commit 0160545
1 Parent(s): 4837650

End of training

.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,87 @@
+ ---
+ license: mit
+ base_model: gpt2-large
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: tiq
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # tiq
+
+ This model is a fine-tuned version of [gpt2-large](https://huggingface.co/gpt2-large) on an unknown dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 5.5477
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 8
+ - eval_batch_size: 8
+ - seed: 42
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 64
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 200
+ - num_epochs: 1
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 6.2342 | 0.04 | 100 | 6.1857 |
+ | 5.7599 | 0.07 | 200 | 5.7751 |
+ | 5.7433 | 0.11 | 300 | 5.7142 |
+ | 5.6021 | 0.15 | 400 | 5.6776 |
+ | 5.5084 | 0.18 | 500 | 5.6349 |
+ | 5.3825 | 0.22 | 600 | 5.6201 |
+ | 5.6698 | 0.26 | 700 | 5.5831 |
+ | 5.4089 | 0.29 | 800 | 5.5687 |
+ | 5.601 | 0.33 | 900 | 5.5574 |
+ | 5.4708 | 0.37 | 1000 | 5.5555 |
+ | 5.5956 | 0.4 | 1100 | 5.5520 |
+ | 5.4704 | 0.44 | 1200 | 5.5494 |
+ | 5.4824 | 0.47 | 1300 | 5.5502 |
+ | 5.589 | 0.51 | 1400 | 5.5478 |
+ | 5.5612 | 0.55 | 1500 | 5.5456 |
+ | 5.4741 | 0.58 | 1600 | 5.5430 |
+ | 5.463 | 0.62 | 1700 | 5.5426 |
+ | 5.5071 | 0.66 | 1800 | 5.5424 |
+ | 5.5469 | 0.69 | 1900 | 5.5419 |
+ | 5.4266 | 0.73 | 2000 | 5.5428 |
+ | 5.4848 | 0.77 | 2100 | 5.5438 |
+ | 5.5069 | 0.8 | 2200 | 5.5446 |
+ | 5.5885 | 0.84 | 2300 | 5.5469 |
+ | 5.4484 | 0.88 | 2400 | 5.5462 |
+ | 5.3859 | 0.91 | 2500 | 5.5475 |
+ | 5.465 | 0.95 | 2600 | 5.5476 |
+ | 5.4355 | 0.99 | 2700 | 5.5477 |
+
+
+ ### Framework versions
+
+ - Transformers 4.39.3
+ - Pytorch 2.2.0+cu121
+ - Datasets 2.18.0
+ - Tokenizers 0.15.2
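
The hyperparameters in this generated card map directly onto `transformers.TrainingArguments`. Below is a minimal sketch of arguments that would reproduce the reported settings; the `output_dir` and the Trainer/dataset wiring are assumptions, since the training script itself is not part of this commit.

```python
# Sketch of TrainingArguments matching the hyperparameters reported in the card.
# The output_dir and the Trainer/dataset wiring are assumptions; the training
# script is not included in this commit.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="tiq",                 # model name from the card
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,    # 8 x 8 = 64 effective (total_train_batch_size)
    lr_scheduler_type="cosine",
    warmup_steps=200,
    num_train_epochs=1,
    seed=42,
    fp16=True,                        # "Native AMP" mixed precision
    evaluation_strategy="steps",
    eval_steps=100,                   # evaluation cadence seen in the results table
    logging_steps=100,
)
# Adam betas=(0.9, 0.999) and epsilon=1e-08 are the TrainingArguments defaults.
# trainer = Trainer(model=model, args=args, train_dataset=..., eval_dataset=...)
```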
.ipynb_checkpoints/config-checkpoint.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "_name_or_path": "gpt2-large",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 11012,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 11013,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 720,
+   "n_embd": 1280,
+   "n_head": 20,
+   "n_inner": null,
+   "n_layer": 36,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.39.3",
+   "use_cache": true,
+   "vocab_size": 11015
+ }
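
This checkpointed config describes a gpt2-large-sized network (36 layers, 20 heads, 1280-dim embeddings) with a reduced 11,015-token vocabulary and custom bos/eos ids. A sketch of instantiating an equivalent model from these values follows; whether the original run resized a pretrained gpt2-large checkpoint or trained from a fresh initialization is not stated in the commit.

```python
# Sketch: building a model with the same shape as the committed config.json.
# All values are copied from the config; how the original model was produced
# (resizing gpt2-large vs. training from scratch) is left open.
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(
    vocab_size=11015,
    n_positions=1024,
    n_embd=1280,
    n_layer=36,
    n_head=20,
    bos_token_id=11012,
    eos_token_id=11013,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    n_ctx=720,  # carried through as an extra config field, as in the committed file
)

model = GPT2LMHeadModel(config)
print(f"{model.num_parameters() / 1e6:.1f}M parameters")
```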
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  license: mit
- base_model: gpt2
+ base_model: gpt2-large
  tags:
  - generated_from_trainer
  model-index:
@@ -13,9 +13,9 @@ should probably proofread and complete it, then remove this comment. -->

  # tiq

- This model is a fine-tuned version of [gpt2](https://huggingface.co/gpt2) on an unknown dataset.
+ This model is a fine-tuned version of [gpt2-large](https://huggingface.co/gpt2-large) on an unknown dataset.
  It achieves the following results on the evaluation set:
- - Loss: 6.5928
+ - Loss: 5.5477

  ## Model description

@@ -34,76 +34,49 @@ More information needed
  ### Training hyperparameters

  The following hyperparameters were used during training:
- - learning_rate: 1e-05
- - train_batch_size: 32
- - eval_batch_size: 32
+ - learning_rate: 5e-05
+ - train_batch_size: 8
+ - eval_batch_size: 8
  - seed: 42
- - gradient_accumulation_steps: 4
- - total_train_batch_size: 128
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 64
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: cosine
- - lr_scheduler_warmup_steps: 500
- - num_epochs: 2
+ - lr_scheduler_warmup_steps: 200
+ - num_epochs: 1
  - mixed_precision_training: Native AMP

  ### Training results

  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:-----:|:----:|:---------------:|
- | 10.1294 | 0.04 | 100 | 10.1406 |
- | 9.5842 | 0.07 | 200 | 9.6140 |
- | 8.8547 | 0.11 | 300 | 8.8427 |
- | 8.0654 | 0.15 | 400 | 8.0300 |
- | 7.5124 | 0.18 | 500 | 7.4592 |
- | 7.3008 | 0.22 | 600 | 7.1798 |
- | 7.0833 | 0.26 | 700 | 7.0491 |
- | 7.0046 | 0.29 | 800 | 6.9639 |
- | 6.9422 | 0.33 | 900 | 6.9085 |
- | 6.9551 | 0.37 | 1000 | 6.8625 |
- | 6.8536 | 0.4 | 1100 | 6.8373 |
- | 6.8439 | 0.44 | 1200 | 6.8129 |
- | 6.7857 | 0.47 | 1300 | 6.7906 |
- | 6.8318 | 0.51 | 1400 | 6.7714 |
- | 6.7894 | 0.55 | 1500 | 6.7490 |
- | 6.6932 | 0.58 | 1600 | 6.7315 |
- | 6.7018 | 0.62 | 1700 | 6.7125 |
- | 6.671 | 0.66 | 1800 | 6.7045 |
- | 6.7686 | 0.69 | 1900 | 6.6873 |
- | 6.7236 | 0.73 | 2000 | 6.6767 |
- | 6.7334 | 0.77 | 2100 | 6.6702 |
- | 6.7135 | 0.8 | 2200 | 6.6624 |
- | 6.7133 | 0.84 | 2300 | 6.6653 |
- | 6.6295 | 0.88 | 2400 | 6.6520 |
- | 6.6343 | 0.91 | 2500 | 6.6496 |
- | 6.5874 | 0.95 | 2600 | 6.6456 |
- | 6.641 | 0.99 | 2700 | 6.6427 |
- | 6.59 | 1.02 | 2800 | 6.6377 |
- | 6.5958 | 1.06 | 2900 | 6.6378 |
- | 6.7154 | 1.1 | 3000 | 6.6313 |
- | 6.6053 | 1.13 | 3100 | 6.6305 |
- | 6.6077 | 1.17 | 3200 | 6.6242 |
- | 6.5719 | 1.21 | 3300 | 6.6202 |
- | 6.6981 | 1.24 | 3400 | 6.6228 |
- | 6.5717 | 1.28 | 3500 | 6.6177 |
- | 6.5864 | 1.31 | 3600 | 6.6139 |
- | 6.6584 | 1.35 | 3700 | 6.6109 |
- | 6.5598 | 1.39 | 3800 | 6.6103 |
- | 6.6571 | 1.42 | 3900 | 6.6063 |
- | 6.6377 | 1.46 | 4000 | 6.6039 |
- | 6.6071 | 1.5 | 4100 | 6.6025 |
- | 6.5311 | 1.53 | 4200 | 6.5994 |
- | 6.6616 | 1.57 | 4300 | 6.6000 |
- | 6.5725 | 1.61 | 4400 | 6.5976 |
- | 6.5851 | 1.64 | 4500 | 6.5963 |
- | 6.5723 | 1.68 | 4600 | 6.5952 |
- | 6.5369 | 1.72 | 4700 | 6.5951 |
- | 6.5928 | 1.75 | 4800 | 6.5950 |
- | 6.5366 | 1.79 | 4900 | 6.5940 |
- | 6.5188 | 1.83 | 5000 | 6.5932 |
- | 6.6146 | 1.86 | 5100 | 6.5929 |
- | 6.5728 | 1.9 | 5200 | 6.5931 |
- | 6.5463 | 1.94 | 5300 | 6.5931 |
- | 6.6269 | 1.97 | 5400 | 6.5928 |
+ | 6.2342 | 0.04 | 100 | 6.1857 |
+ | 5.7599 | 0.07 | 200 | 5.7751 |
+ | 5.7433 | 0.11 | 300 | 5.7142 |
+ | 5.6021 | 0.15 | 400 | 5.6776 |
+ | 5.5084 | 0.18 | 500 | 5.6349 |
+ | 5.3825 | 0.22 | 600 | 5.6201 |
+ | 5.6698 | 0.26 | 700 | 5.5831 |
+ | 5.4089 | 0.29 | 800 | 5.5687 |
+ | 5.601 | 0.33 | 900 | 5.5574 |
+ | 5.4708 | 0.37 | 1000 | 5.5555 |
+ | 5.5956 | 0.4 | 1100 | 5.5520 |
+ | 5.4704 | 0.44 | 1200 | 5.5494 |
+ | 5.4824 | 0.47 | 1300 | 5.5502 |
+ | 5.589 | 0.51 | 1400 | 5.5478 |
+ | 5.5612 | 0.55 | 1500 | 5.5456 |
+ | 5.4741 | 0.58 | 1600 | 5.5430 |
+ | 5.463 | 0.62 | 1700 | 5.5426 |
+ | 5.5071 | 0.66 | 1800 | 5.5424 |
+ | 5.5469 | 0.69 | 1900 | 5.5419 |
+ | 5.4266 | 0.73 | 2000 | 5.5428 |
+ | 5.4848 | 0.77 | 2100 | 5.5438 |
+ | 5.5069 | 0.8 | 2200 | 5.5446 |
+ | 5.5885 | 0.84 | 2300 | 5.5469 |
+ | 5.4484 | 0.88 | 2400 | 5.5462 |
+ | 5.3859 | 0.91 | 2500 | 5.5475 |
+ | 5.465 | 0.95 | 2600 | 5.5476 |
+ | 5.4355 | 0.99 | 2700 | 5.5477 |


  ### Framework versions
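
With the card now pointing at a gpt2-large base, loading the committed checkpoint for generation looks roughly like the sketch below. The repository id `smiled0g/tiq` is inferred from the committer and model name and may not be exact; the prompt is illustrative only.

```python
# Sketch: loading the committed checkpoint for text generation.
# "smiled0g/tiq" is an assumed repository id (committer + model name); adjust as needed.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "smiled0g/tiq"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

inputs = tokenizer("example prompt", return_tensors="pt")
outputs = model.generate(
    **inputs,
    do_sample=True,     # mirrors task_specific_params in config.json
    max_length=50,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```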
config.json CHANGED
@@ -1,21 +1,21 @@
  {
-   "_name_or_path": "gpt2",
+   "_name_or_path": "gpt2-large",
    "activation_function": "gelu_new",
    "architectures": [
      "GPT2LMHeadModel"
    ],
    "attn_pdrop": 0.1,
-   "bos_token_id": 48002,
+   "bos_token_id": 11012,
    "embd_pdrop": 0.1,
-   "eos_token_id": 48003,
+   "eos_token_id": 11013,
    "initializer_range": 0.02,
    "layer_norm_epsilon": 1e-05,
    "model_type": "gpt2",
    "n_ctx": 720,
-   "n_embd": 768,
-   "n_head": 12,
+   "n_embd": 1280,
+   "n_head": 20,
    "n_inner": null,
-   "n_layer": 12,
+   "n_layer": 36,
    "n_positions": 1024,
    "reorder_and_upcast_attn": false,
    "resid_pdrop": 0.1,
@@ -35,5 +35,5 @@
    "torch_dtype": "float32",
    "transformers_version": "4.39.3",
    "use_cache": true,
-   "vocab_size": 48005
+   "vocab_size": 11015
  }
generation_config.json CHANGED
@@ -1,6 +1,6 @@
  {
    "_from_model_config": true,
-   "bos_token_id": 48002,
-   "eos_token_id": 48003,
+   "bos_token_id": 11012,
+   "eos_token_id": 11013,
    "transformers_version": "4.39.3"
  }
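
The generation config change simply re-points the special token ids at the new tokenizer. For illustration, the same file expressed through the `GenerationConfig` API (a sketch, not part of the commit):

```python
# Sketch: the committed generation_config.json expressed via the transformers API.
from transformers import GenerationConfig

gen_config = GenerationConfig(
    bos_token_id=11012,  # matches config.json / the custom tokenizer
    eos_token_id=11013,
)
# gen_config.save_pretrained("tiq")  # would write a generation_config.json like the one above
```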
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2ac31958c9741942f7f461ab5ec997050cd64aa30f3c26572f1355ae5a0e7617
- size 490856064
+ oid sha256:7fc6d176fb87e4ec6ab3a46eb45eab6abb1b6d0178ddcb9b79bd62a496306e4f
+ size 2895246888
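
The new pointer size is consistent with the config above: at `torch_dtype: float32` (4 bytes per parameter), 2,895,246,888 bytes correspond to roughly 724M parameters, i.e. a gpt2-large backbone with the vocabulary shrunk from 50,257 to 11,015 tokens. A back-of-the-envelope check, ignoring the small safetensors header overhead, is sketched below.

```python
# Rough sanity check of the safetensors size against the committed config.
# Ignores the small safetensors header/metadata overhead.
n_layer, n_embd, n_positions, vocab_size = 36, 1280, 1024, 11015

embeddings = vocab_size * n_embd + n_positions * n_embd  # wte + wpe (lm_head is tied to wte)
per_block = 12 * n_embd**2 + 13 * n_embd                 # attention + MLP weights/biases + two layer norms
final_ln = 2 * n_embd                                    # ln_f weight + bias
params = embeddings + n_layer * per_block + final_ln

print(f"~{params / 1e6:.0f}M params, ~{params * 4 / 1e9:.2f} GB in float32")
# ~724M params, ~2.90 GB: consistent with the 2,895,246,888-byte pointer above
# (the remainder is safetensors header overhead).
```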
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
  {
    "added_tokens_decoder": {
-     "48004": {
+     "11014": {
        "content": "[PAD]",
        "lstrip": false,
        "normalized": false,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cecb3a937044fdfdd46dc45d08a7538f60dba734a202cfc63b0429875eee4292
+ oid sha256:5b5327a7f264d98a7468fa2e2b0dbffc5af82843afd083bc8123aff3bd849216
  size 4856