# NOTE(review): removed scraped file-viewer chrome (file size "1,151 Bytes",
# commit hash 2fd97cf, and the line-number gutter) — it was not valid Python.
import json
import os

from transformers import PreTrainedTokenizer
class CognitivessTokenizer(PreTrainedTokenizer):
    """Minimal whitespace tokenizer built on ``PreTrainedTokenizer``.

    Tokenization is plain ``str.split`` (no normalization, no subwords);
    detokenization rejoins with single spaces.

    NOTE(review): ``self.encoder`` (token -> id) and ``self.decoder``
    (id -> token) are read throughout but never assigned in this class —
    presumably loaded from ``vocab.json`` by the base class or a subclass;
    confirm before use.
    """

    def __init__(self, *args, **kwargs):
        # All setup (special tokens, added-token machinery) is deferred to
        # the transformers base class.
        super().__init__(*args, **kwargs)

    @property
    def vocab_size(self):
        """Size of the base vocabulary (added tokens not included)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return a copy of the token -> id mapping."""
        return dict(self.encoder)

    def _tokenize(self, text):
        """Split *text* on whitespace runs; returns a list of tokens."""
        return text.split()

    def _convert_token_to_id(self, token):
        """Map a token to its id, falling back to the unk token's id."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Map an id back to its token, falling back to the unk token."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Rejoin tokens with single spaces (inverse of ``_tokenize``)."""
        return " ".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary to ``<prefix->vocab.json`` in *save_directory*.

        ``filename_prefix`` (new, default ``None`` — backward compatible)
        matches the modern ``PreTrainedTokenizer.save_vocabulary`` hook
        signature; when given it is prepended to the filename.

        Returns a 1-tuple with the written file path, as the base-class
        contract expects.
        """
        filename = ("" if filename_prefix is None else filename_prefix + "-") + "vocab.json"
        vocab_file = os.path.join(save_directory, filename)
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.encoder, f, ensure_ascii=False)
        return (vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Frame sequences with BOS/EOS special tokens.

        Single sequence: ``BOS ids_0 EOS``.
        Pair: ``BOS ids_0 EOS ids_1 EOS``.

        BUGFIX: the original accepted ``token_ids_1`` but silently dropped
        it, so sequence pairs lost their second segment.
        """
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        if token_ids_1 is None:
            return bos + token_ids_0 + eos
        return bos + token_ids_0 + eos + token_ids_1 + eos
# NOTE(review): removed trailing "|" gutter residue from the scraped page.