from transformers.models.auto.tokenization_auto import get_class_from_dynamic_module
from transformers.tokenization_utils import AddedToken

# Dynamically load the CodeGen2.5 tokenizer class and its tiktoken encoder factory
# from the Salesforce/codegen25-7b-multi repository.
CodeGen25Tokenizer = get_class_from_dynamic_module(
    "tokenization_codegen25.CodeGen25Tokenizer", "Salesforce/codegen25-7b-multi")
tiktoken_tokenizer = get_class_from_dynamic_module(
    "tokenization_codegen25.tiktoken_tokenizer", "Salesforce/codegen25-7b-multi")


class DeciCoderTokenizer(CodeGen25Tokenizer):
    def __init__(
        self,
        pad_token=None,
        eos_token="<|endoftext|>",
        add_eos_token=False,
        add_special_tokens=True,
        **kwargs,
    ):
        self.add_eos_token = add_eos_token
        # Build the underlying tiktoken encoder on the GPT-2 base vocabulary.
        self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
        # Wrap string special tokens as AddedToken so flanking whitespace is preserved.
        pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        super().__init__(
            pad_token=pad_token_added,
            eos_token=eos_token_added,
            add_eos_token=add_eos_token,
            add_special_tokens=add_special_tokens,
            **kwargs,
        )

    def _convert_id_to_token(self, index):
        # Fall back to None for ids the parent tokenizer cannot map back to a token.
        try:
            return super()._convert_id_to_token(index)
        except Exception:
            return None
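

# --- Usage sketch (illustrative addition, not part of the upstream file) ---
# In practice this tokenizer is loaded through AutoTokenizer with
# trust_remote_code=True; the checkpoint name below is an assumption.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("Deci/DeciCoder-1b", trust_remote_code=True)
    ids = tokenizer("def fibonacci(n):").input_ids
    print(ids)
    print(tokenizer.decode(ids))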