import json
import os

from transformers import PreTrainedTokenizer


class CognitivessTokenizer(PreTrainedTokenizer):
    """Whitespace tokenizer backed by an ``encoder``/``decoder`` vocab pair.

    NOTE(review): ``self.encoder`` (token -> id) and ``self.decoder``
    (id -> token) are read throughout but never assigned here — presumably
    a subclass or the base-class kwargs machinery populates them; confirm
    against the caller before use.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def vocab_size(self):
        """Number of entries in the token -> id mapping."""
        return len(self.encoder)

    def get_vocab(self):
        """Return a copy of the token -> id mapping."""
        return dict(self.encoder)

    def _tokenize(self, text):
        """Split ``text`` on whitespace into tokens."""
        return text.split()

    def _convert_token_to_id(self, token):
        """Map a token to its id, falling back to the unk token's id."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Map an id back to its token, falling back to the unk token."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Join tokens with single spaces (inverse of ``_tokenize``)."""
        return " ".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary to ``vocab.json`` in ``save_directory``.

        Args:
            save_directory: Target directory (must already exist).
            filename_prefix: Optional prefix prepended to the file name,
                matching the ``PreTrainedTokenizer`` save convention.

        Returns:
            A 1-tuple with the path of the written vocab file.
        """
        # BUG FIX: the original referenced `os` and `json` without importing
        # them, raising NameError on first call.
        name = "vocab.json" if filename_prefix is None else f"{filename_prefix}-vocab.json"
        vocab_file = os.path.join(save_directory, name)
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.encoder, f, ensure_ascii=False)
        return (vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Add BOS/EOS around one sequence, or a BOS/EOS-delimited pair.

        Single sequence:  ``<bos> A <eos>``
        Sequence pair:    ``<bos> A <eos> B <eos>``

        The original implementation silently dropped ``token_ids_1``,
        losing the second segment of a pair input.
        """
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        output = bos + token_ids_0 + eos
        if token_ids_1 is not None:
            output = output + token_ids_1 + eos
        return output