from typing import List, Optional, Union

from transformers.models.llama import LlamaTokenizerFast
class DeepseekTokenizerFast(LlamaTokenizerFast):
    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Convert a token id, or a sequence of token ids, into the corresponding
        token string(s) using the vocabulary and added tokens.

        Ids that do not map to any known token are rendered as the empty
        string, so list results always contain plain strings.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._convert_id_to_token(ids)

        # Hoist the special-ids lookup out of the loop; the filtering result
        # is unchanged.
        special_ids = self.all_special_ids
        decoded: List[str] = []
        for raw_id in ids:
            token_id = int(raw_id)
            if skip_special_tokens and token_id in special_ids:
                continue
            token = self._tokenizer.id_to_token(token_id)
            decoded.append("" if token is None else token)
        return decoded

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        # Map a single id to its token string; an id unknown to the backend
        # tokenizer comes back as "" rather than None so callers always
        # receive a string.
        token = self._tokenizer.id_to_token(int(index))
        if token is None:
            return ""
        return token