Zorro123444 committed on
Commit
2b558d0
1 Parent(s): fc52e12

Upload tokenizer

Browse files
special_tokens_map.json CHANGED
@@ -13,7 +13,13 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": "!",
 
 
 
 
 
 
17
  "unk_token": {
18
  "content": "<unk>",
19
  "lstrip": false,
 
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": {
17
+ "content": "!",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
  "unk_token": {
24
  "content": "<unk>",
25
  "lstrip": false,
tokenization_minicpmv_fast.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
4
+
5
+
6
class MiniCPMVTokenizerFast(PreTrainedTokenizerFast):
    """Fast tokenizer subclass that adds marker strings and id shortcuts.

    Everything tokenizer-related is inherited from
    ``PreTrainedTokenizerFast``; this class only stores a set of marker
    strings on the instance and exposes a few read-only id properties.
    """

    def __init__(self, **kwargs):
        """Initialise the base tokenizer, then attach the marker strings.

        Args:
            **kwargs: Forwarded unchanged to
                ``PreTrainedTokenizerFast.__init__``.
        """
        super().__init__(**kwargs)
        # Marker strings. Only ``eot_token``, ``im_start`` and ``im_end``
        # are read inside this class (by the id properties below).
        # NOTE(review): the remaining markers are presumably consumed by
        # the surrounding model/processing code -- confirm against callers.
        self.eot_token = "<|eot_id|>"
        self.im_start = "<image>"
        self.im_end = "</image>"
        self.ref_start = "<ref>"
        self.ref_end = "</ref>"
        self.box_start = "<box>"
        self.box_end = "</box>"
        self.quad_start = "<quad>"
        self.quad_end = "</quad>"
        self.slice_start = "<slice>"
        self.slice_end = "</slice>"

    @property
    def eos_id(self):
        """Alias for ``self.eos_token_id``."""
        return self.eos_token_id

    @property
    def bos_id(self):
        """Alias for ``self.bos_token_id``."""
        return self.bos_token_id

    @property
    def unk_id(self):
        """Alias for ``self.unk_token_id``."""
        return self.unk_token_id

    @property
    def eot_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.eot_token``."""
        return self.convert_tokens_to_ids(self.eot_token)

    @property
    def im_start_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.im_start``."""
        return self.convert_tokens_to_ids(self.im_start)

    @property
    def im_end_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.im_end``."""
        return self.convert_tokens_to_ids(self.im_end)

    @staticmethod
    def escape(text: str) -> str:
        """Return *text* unchanged (no escaping is performed)."""
        return text

    @staticmethod
    def unescape(text: str) -> str:
        """Return *text* unchanged (no unescaping is performed)."""
        return text
tokenizer_config.json CHANGED
@@ -2059,8 +2059,8 @@
2059
  },
2060
  "auto_map": {
2061
  "AutoTokenizer": [
2062
- "openbmb/MiniCPM-Llama3-V-2_5--tokenization_minicpmv_fast.MiniCPMVTokenizerFast",
2063
- null
2064
  ]
2065
  },
2066
  "bos_token": "<|begin_of_text|>",
 
2059
  },
2060
  "auto_map": {
2061
  "AutoTokenizer": [
2062
+ null,
2063
+ "tokenization_minicpmv_fast.MiniCPMVTokenizerFast"
2064
  ]
2065
  },
2066
  "bos_token": "<|begin_of_text|>",