codebyzeb committed
Commit d852007
1 Parent(s): aa448ff

Upload tokenizer
Files changed (3):
  1. special_tokens_map.json +2 -2
  2. tokenizer.json +2 -20
  3. tokenizer_config.json +4 -20
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
 {
-  "bos_token": "BOS",
-  "eos_token": "EOS",
+  "bos_token": "UTT_BOUNDARY",
+  "eos_token": "UTT_BOUNDARY",
   "pad_token": "PAD",
   "unk_token": "UNK"
 }
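The change above collapses the separate BOS and EOS markers into a single utterance-boundary token. A minimal sketch of checking the result after loading; the repo id "codebyzeb/tokenizer-repo" is a placeholder, since the commit page does not show the repository path:

from transformers import AutoTokenizer

# Placeholder repo id, not taken from this page.
tokenizer = AutoTokenizer.from_pretrained("codebyzeb/tokenizer-repo")

# Both boundary roles now resolve to the same token.
print(tokenizer.special_tokens_map)
# {'bos_token': 'UTT_BOUNDARY', 'eos_token': 'UTT_BOUNDARY',
#  'pad_token': 'PAD', 'unk_token': 'UNK'}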
tokenizer.json CHANGED
@@ -22,26 +22,8 @@
     "special": true
   },
   {
-    "id": 2,
-    "content": "BOS",
-    "single_word": false,
-    "lstrip": false,
-    "rstrip": false,
-    "normalized": false,
-    "special": true
-  },
-  {
-    "id": 3,
-    "content": "EOS",
-    "single_word": false,
-    "lstrip": false,
-    "rstrip": false,
-    "normalized": false,
-    "special": true
-  },
-  {
-    "id": 196,
-    "content": "<|endoftext|>",
+    "id": 5,
+    "content": "UTT_BOUNDARY",
     "single_word": false,
     "lstrip": false,
     "rstrip": false,
tokenizer_config.json CHANGED
@@ -17,24 +17,8 @@
     "single_word": false,
     "special": true
   },
-  "2": {
-    "content": "BOS",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false,
-    "special": true
-  },
-  "3": {
-    "content": "EOS",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false,
-    "special": true
-  },
-  "196": {
-    "content": "<|endoftext|>",
+  "5": {
+    "content": "UTT_BOUNDARY",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
@@ -42,9 +26,9 @@
     "special": true
   }
   },
-  "bos_token": "BOS",
+  "bos_token": "UTT_BOUNDARY",
   "clean_up_tokenization_spaces": true,
-  "eos_token": "EOS",
+  "eos_token": "UTT_BOUNDARY",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "PAD",
   "tokenizer_class": "GPT2Tokenizer",