|
from transformers.models.auto.tokenization_auto import get_class_from_dynamic_module |
|
from transformers.tokenization_utils import AddedToken |
|
|
|
CodeGen25Tokenizer = get_class_from_dynamic_module("tokenization_codegen25.CodeGen25Tokenizer", |
|
"Salesforce/codegen25-7b-multi") |
|
tiktoken_tokenizer = get_class_from_dynamic_module("tokenization_codegen25.tiktoken_tokenizer", |
|
"Salesforce/codegen25-7b-multi") |
|
|
|
|
|
class DeciCoderTokenizer(CodeGen25Tokenizer):
    """Tokenizer for DeciCoder, thinly wrapping the dynamically-loaded CodeGen25 tokenizer.

    Differences from the parent:
      * builds its own tiktoken encoder (GPT-2 base vocabulary),
      * returns ``None`` instead of raising when an id cannot be converted
        back to a token.
    """

    def __init__(
        self,
        pad_token=None,
        eos_token="<|endoftext|>",
        add_eos_token=False,
        add_special_tokens=True,
        **kwargs,
    ):
        """Construct the tokenizer.

        Args:
            pad_token: Padding token (str or AddedToken), or ``None`` for no padding token.
            eos_token: End-of-sequence token (str or AddedToken).
            add_eos_token: Whether an EOS token is appended during encoding.
            add_special_tokens: Forwarded to the tiktoken encoder's ``add_special`` flag.
            **kwargs: Passed through to the parent tokenizer's ``__init__``.
        """
        # NOTE: these attributes are assigned BEFORE super().__init__ on purpose —
        # the parent constructor presumably reads them during setup (TODO confirm
        # against the remote tokenization_codegen25 module).
        self.add_eos_token = add_eos_token
        self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)

        # Normalize plain strings into AddedToken so lstrip/rstrip behavior is
        # explicit; pre-built AddedToken instances pass through unchanged.
        pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token

        super().__init__(
            pad_token=pad_token_added,
            eos_token=eos_token_added,
            add_eos_token=add_eos_token,
            add_special_tokens=add_special_tokens,
            **kwargs,
        )

    def _convert_id_to_token(self, index):
        """Convert a token id to its string form, or ``None`` if the id is unknown.

        The parent implementation raises when the id is outside the vocabulary;
        callers of this subclass get ``None`` instead.
        """
        # Narrowed from a bare ``except:`` — a bare except also swallows
        # KeyboardInterrupt/SystemExit, which must propagate.
        try:
            return super()._convert_id_to_token(index)
        except Exception:
            return None
|
|