from transformers.dynamic_module_utils import get_class_from_dynamic_module
from transformers.tokenization_utils import AddedToken

# Both dynamically loaded names below are resolved against the same repository
# and the same pinned revision.
_codegen_revision = dict(
    pretrained_model_name_or_path="Salesforce/codegen25-7b-multi",
    revision="d4dc9dd90e8b23d5411e6d970e3a11e88dc5c2bc",
)
CodeGen25Tokenizer = get_class_from_dynamic_module(
    "tokenization_codegen25.CodeGen25Tokenizer", **_codegen_revision)
tiktoken_tokenizer = get_class_from_dynamic_module(
    "tokenization_codegen25.tiktoken_tokenizer", **_codegen_revision)


class DeciCoderTokenizer(CodeGen25Tokenizer):
    """CodeGen25Tokenizer variant backed by a "gpt2"-based tiktoken encoder.

    On top of the base class this adds pickling support (the encoder is
    dropped on pickle and rebuilt on unpickle), a tolerant
    ``_convert_id_to_token`` and a ``save_pretrained`` workaround.
    """

    def __init__(
        self,
        pad_token=None,
        eos_token="<|endoftext|>",
        add_eos_token=False,
        add_special_tokens=True,
        **kwargs,
    ):
        """Build the tiktoken encoder and initialise the base tokenizer.

        Args:
            pad_token: Padding token. A ``str`` is wrapped in an
                ``AddedToken`` (no left/right stripping); any other value is
                forwarded unchanged.
            eos_token: End-of-sequence token, handled like ``pad_token``.
            add_eos_token: Stored on the instance and forwarded to the base
                constructor.
            add_special_tokens: Forwarded to ``tiktoken_tokenizer`` as
                ``add_special`` and to the base constructor.
            **kwargs: Forwarded to the base constructor.
        """
        # Keep the encoder arguments around so __setstate__ can rebuild the
        # encoder after unpickling.
        self._tiktoken_kwargs = dict(
            base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
        self.add_eos_token = add_eos_token
        # NOTE(review): these attributes are assigned before the base
        # constructor runs, presumably because it relies on them -- confirm
        # before reordering.
        self.encoder = tiktoken_tokenizer(**self._tiktoken_kwargs)

        if isinstance(pad_token, str):
            pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False)
        else:
            pad_token_added = pad_token
        if isinstance(eos_token, str):
            eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False)
        else:
            eos_token_added = eos_token

        super().__init__(
            pad_token=pad_token_added,
            eos_token=eos_token_added,
            add_eos_token=add_eos_token,
            add_special_tokens=add_special_tokens,
            **kwargs,
        )

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or ``None`` if the base lookup fails.

        Works around a bug in CodeGen25Tokenizer, whose implementation raises
        for some ids instead of returning a value.
        """
        try:
            return super()._convert_id_to_token(index)
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except`` so
            # KeyboardInterrupt / SystemExit still propagate.
            return None

    def __getstate__(self):
        """Return the instance state with the encoder removed so it can be pickled."""
        return {**self.__dict__, "encoder": None}

    def __setstate__(self, state):
        """Restore the instance state and rebuild the tiktoken encoder.

        The encoder is recreated from the ``_tiktoken_kwargs`` saved in
        ``state``; the caller's ``state`` dict is left unmodified.
        """
        encoder = tiktoken_tokenizer(**state["_tiktoken_kwargs"])
        self.__dict__ = {**state, "encoder": encoder}

    def save_pretrained(self, *args, **kwargs):
        """Save the tokenizer, working around a serialization crash.

        ``add_special_tokens`` is not JSON serializable, which crashes
        ``save_pretrained()``. Removing it from the tokenizer_config.json does
        not affect ``from_pretrained()``. The attribute is therefore set to
        ``None`` for the duration of the save and restored afterwards, even if
        the save fails.

        Args:
            *args: Forwarded to the base ``save_pretrained``.
            **kwargs: Forwarded to the base ``save_pretrained``.

        Returns:
            Whatever the base ``save_pretrained`` returns.
        """
        add_special_tokens = self.add_special_tokens
        self.add_special_tokens = None
        try:
            return super().save_pretrained(*args, **kwargs)
        finally:
            # Restore on both success and failure so a failed save does not
            # leave the tokenizer with a clobbered attribute.
            self.add_special_tokens = add_special_tokens