from typing import List, Optional, Union

from transformers.models.llama import LlamaTokenizerFast


class DeepseekTokenizerFast(LlamaTokenizerFast):
    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
""" |
|
Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and |
|
added tokens. |
|
|
|
Args: |
|
ids (`int` or `List[int]`): |
|
The token id (or token ids) to convert to tokens. |
|
skip_special_tokens (`bool`, *optional*, defaults to `False`): |
|
Whether or not to remove special tokens in the decoding. |
|
|
|
Returns: |
|
`str` or `List[str]`: The decoded token(s). |
|
""" |
        if isinstance(ids, int):
            return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            # Optionally drop special tokens (BOS/EOS, padding, etc.) from the output.
            if skip_special_tokens and index in self.all_special_ids:
                continue
            # Look up the token string directly in the backing fast tokenizer.
            token = self._tokenizer.id_to_token(index)
            # Ids outside the vocabulary map to "" so the result is always a list of strings.
            tokens.append(token if token is not None else "")
        return tokens

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        # Mirror convert_ids_to_tokens: an id outside the vocabulary yields an
        # empty string rather than None, despite the Optional[str] annotation.
        token = self._tokenizer.id_to_token(int(index))
        return token if token is not None else ""
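
# A minimal usage sketch, kept out of the import path behind a __main__ guard.
# The checkpoint path below is a placeholder assumption; substitute any repo id
# or local directory that ships a tokenizer compatible with this class.
if __name__ == "__main__":
    tokenizer = DeepseekTokenizerFast.from_pretrained("path/to/deepseek-tokenizer")
    ids = tokenizer.encode("Hello world")
    # With skip_special_tokens=True, special-token ids (e.g. BOS) are dropped
    # from the output entirely; unknown ids come back as "" rather than None.
    print(tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True))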