ai-forever committed
Commit 1927423
1 Parent(s): 5e58709

Fix tokenization config based on GitHub usage examples

Files changed (4)
  1. config.json +3 -2
  2. special_tokens_map.json +11 -2
  3. tokenizer.json +46 -1
  4. tokenizer_config.json +60 -25
config.json CHANGED
@@ -5,9 +5,9 @@
     "GPT2LMHeadModel"
   ],
   "attn_pdrop": 0.1,
-  "bos_token_id": 50256,
+  "bos_token_id": 0,
   "embd_pdrop": 0.1,
-  "eos_token_id": 50256,
+  "eos_token_id": 5,
   "gradient_checkpointing": false,
   "initializer_range": 0.02,
   "layer_norm_epsilon": 1e-05,
@@ -18,6 +18,7 @@
   "n_inner": null,
   "n_layer": 40,
   "n_positions": 2048,
+  "pad_token_id": 1,
   "reorder_and_upcast_attn": false,
   "resid_pdrop": 0.1,
   "scale_attn_by_inverse_layer_idx": false,
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
 {
   "bos_token": {
-    "content": "<|endoftext|>",
+    "content": "<s>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,
@@ -13,8 +13,17 @@
     "rstrip": false,
     "single_word": false
   },
+  "mask_token": "<mask>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": "</s>",
   "unk_token": {
-    "content": "<|endoftext|>",
+    "content": "<unk>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,
tokenizer.json CHANGED
@@ -3,6 +3,51 @@
   "truncation": null,
   "padding": null,
   "added_tokens": [
+    {
+      "id": 0,
+      "content": "<s>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "</s>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": true
+    },
+    {
+      "id": 4,
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
     {
       "id": 5,
       "content": "<|endoftext|>",
@@ -199781,4 +199826,4 @@
       "Ġget rouwd"
     ]
   }
-}
+}
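The five new entries pin the special tokens to the first vocabulary slots, ahead of the existing <|endoftext|> at id 5 that the config's eos_token_id now references. A quick round-trip sketch, again assuming the transformers tokenizer API:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("ai-forever/mGPT-13B")

    # Every added token should resolve to its declared id, and back.
    expected = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, "<mask>": 4, "<|endoftext|>": 5}
    for token, idx in expected.items():
        assert tok.convert_tokens_to_ids(token) == idx
        assert tok.convert_ids_to_tokens(idx) == token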
tokenizer_config.json CHANGED
@@ -1,33 +1,68 @@
 {
   "add_bos_token": false,
   "add_prefix_space": false,
-  "bos_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
   },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
   "errors": "replace",
+  "mask_token": "<mask>",
+  "model_max_length": 2048,
   "name_or_path": "ai-forever/mGPT-13B",
-  "pad_token": null,
-  "special_tokens_map_file": null,
+  "pad_token": "<pad>",
+  "padding_side": "left",
+  "sep_token": "</s>",
   "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
+  "truncation_side": "left",
+  "trust_remote_code": false,
+  "unk_token": "<unk>"
 }
  }