implement _convert_id_to_token
tokenization_qwen.py CHANGED (+28 -7)
```diff
@@ -78,7 +78,7 @@ class QWenTokenizer(PreTrainedTokenizer):
 
         self.errors = errors  # how to handle errors in decoding
 
-        name = "
+        name = "Qwen"
         ENDOFTEXT = "<|endoftext|>"
         IMSTART = "<|im_start|>"
         IMEND = "<|im_end|>"
@@ -181,10 +181,6 @@ class QWenTokenizer(PreTrainedTokenizer):
         Args:
             text (`str`):
                 The sequence to be encoded.
-            pair (`str`, *optional*):
-                A second sequence to be encoded with the first.
-            add_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to add the special tokens associated with the corresponding model.
             kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific encode method. See details in
                 [`~PreTrainedTokenizerBase.__call__`]
@@ -214,7 +210,31 @@ class QWenTokenizer(PreTrainedTokenizer):
         return self.tokenizer.n_vocab
 
     def _convert_id_to_token(self, index: int) -> str:
-        raise NotImplementedError
+        if index >= self.tokenizer.n_vocab:
+            return self.unk_token
+        return self.tokenizer.decode([index])
+
+    def _convert_token_to_id(self, token: str) -> int:
+        """Converts a token to an id using the vocab."""
+        return self.encoder.get(token.encode('UTF-8'), self.tokenizer.encode(self.unk_token, allowed_special='all')[0])
+
+    @property
+    def all_special_tokens(self) -> List[str]:
+        """
+        `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
+
+        Convert tokens of `tokenizers.AddedToken` type to string.
+        """
+        all_toks = [str(s) for s in self.special_tokens.keys()]
+        return all_toks
+
+    @property
+    def all_special_ids(self) -> List[int]:
+        """
+        `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
+        """
+        all_ids = [v for v in self.special_tokens.values()]
+        return all_ids
 
     def _tokenize(self, text, **kwargs):
         """
@@ -229,9 +249,10 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         token_ids: Union[int, List[int]],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = None,
         **kwargs,
     ) -> str:
         if isinstance(token_ids, int):
             token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i not in self.all_special_ids]
         return self.tokenizer.decode(token_ids)
```
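The two conversion methods are the heart of the commit: `_convert_id_to_token` decodes a single id through the inner tiktoken `Encoding` (returning `unk_token` for out-of-range ids), and `_convert_token_to_id` looks the token's UTF-8 bytes up in `self.encoder`, falling back to the unk id. A minimal round-trip sketch, assuming a checkpoint that ships this file; the model id below is illustrative, not part of the commit:

```python
from transformers import AutoTokenizer

# Illustrative model id; any checkpoint carrying this tokenization_qwen.py works.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)

# tok.tokenizer is the inner tiktoken Encoding this file delegates to.
ids = tok.tokenizer.encode("hello world")
tokens = [tok._convert_id_to_token(i) for i in ids]

# Out-of-range ids hit the new guard and map to unk_token.
assert tok._convert_id_to_token(tok.tokenizer.n_vocab) == tok.unk_token

# ASCII tokens round-trip: each one decodes to valid UTF-8, so its bytes are
# found in the vocab again. An id covering only part of a multi-byte character
# would decode to replacement characters and fall back to the unk id instead.
assert [tok._convert_token_to_id(t) for t in tokens] == ids
```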
78 |
|
79 |
self.errors = errors # how to handle errors in decoding
|
80 |
|
81 |
+
name = "Qwen"
|
82 |
ENDOFTEXT = "<|endoftext|>"
|
83 |
IMSTART = "<|im_start|>"
|
84 |
IMEND = "<|im_end|>"
|
|
|
181 |
Args:
|
182 |
text (`str`):
|
183 |
The sequence to be encoded.
|
|
|
|
|
|
|
|
|
184 |
kwargs (additional keyword arguments, *optional*):
|
185 |
Will be passed to the underlying model specific encode method. See details in
|
186 |
[`~PreTrainedTokenizerBase.__call__`]
|
|
|
210 |
return self.tokenizer.n_vocab
|
211 |
|
212 |
def _convert_id_to_token(self, index: int) -> str:
|
213 |
+
if index >= self.tokenizer.n_vocab:
|
214 |
+
return self.unk_token
|
215 |
+
return self.tokenizer.decode([index])
|
216 |
+
|
217 |
+
def _convert_token_to_id(self, token: str) -> int:
|
218 |
+
"""Converts a token to an id using the vocab."""
|
219 |
+
return self.encoder.get(token.encode('UTF-8'), self.tokenizer.encode(self.unk_token, allowed_special='all')[0])
|
220 |
+
|
221 |
+
@property
|
222 |
+
def all_special_tokens(self) -> List[str]:
|
223 |
+
"""
|
224 |
+
`List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
|
225 |
+
|
226 |
+
Convert tokens of `tokenizers.AddedToken` type to string.
|
227 |
+
"""
|
228 |
+
all_toks = [str(s) for s in self.special_tokens.keys()]
|
229 |
+
return all_toks
|
230 |
+
|
231 |
+
@property
|
232 |
+
def all_special_ids(self) -> List[int]:
|
233 |
+
"""
|
234 |
+
`List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
|
235 |
+
"""
|
236 |
+
all_ids = [v for v in self.special_tokens.values()]
|
237 |
+
return all_ids
|
238 |
|
239 |
def _tokenize(self, text, **kwargs):
|
240 |
"""
|
|
|
249 |
self,
|
250 |
token_ids: Union[int, List[int]],
|
251 |
skip_special_tokens: bool = False,
|
|
|
252 |
**kwargs,
|
253 |
) -> str:
|
254 |
if isinstance(token_ids, int):
|
255 |
token_ids = [token_ids]
|
256 |
+
if skip_special_tokens:
|
257 |
+
token_ids = [i for i in token_ids if i not in self.all_special_ids]
|
258 |
return self.tokenizer.decode(token_ids)
|
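`_decode` now honors `skip_special_tokens` by filtering ids against the new `all_special_ids` property (the values of `self.special_tokens`). A sketch of the difference, reusing `tok` from above; the chat-markup string is illustrative:

```python
text = "<|im_start|>user hi<|im_end|>"
ids = tok.tokenizer.encode(text, allowed_special="all")  # keep special-token ids

print(tok.decode(ids))
# expected: <|im_start|>user hi<|im_end|>
print(tok.decode(ids, skip_special_tokens=True))
# expected: user hi   (ids in tok.all_special_ids are dropped before decoding)
print(tok.all_special_tokens)
# expected: ['<|endoftext|>', '<|im_start|>', '<|im_end|>', ...]
```

Dropping `clean_up_tokenization_spaces` from the `_decode` signature stays compatible with callers: the base class still forwards it, and the remaining `**kwargs` absorbs it.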