Rename cognitivess_model/tokenization_Cognitivess.py to cognitivess_model/tokenization_cognitivess.py
2fd97cf
verified
import json
import os

from transformers import PreTrainedTokenizer
class CognitivessTokenizer(PreTrainedTokenizer):
    """Whitespace tokenizer for Cognitivess models.

    Tokenizes by splitting on whitespace and maps tokens to ids via an
    ``encoder`` dict (token -> id) and back via a ``decoder`` dict
    (id -> token). NOTE(review): ``self.encoder`` / ``self.decoder`` are
    assumed to be populated elsewhere (e.g. by a vocab-loading ``__init__``
    in the full file) — they are not created here; confirm against the
    complete class.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def vocab_size(self) -> int:
        """Number of tokens in the base vocabulary.

        Declared as a property (not a plain method) because
        ``PreTrainedTokenizer`` defines ``vocab_size`` as a property and
        its ``__len__`` does arithmetic on ``self.vocab_size`` — a plain
        method here would break ``len(tokenizer)``.
        """
        return len(self.encoder)

    def get_vocab(self) -> dict:
        """Return a copy of the token -> id mapping."""
        return dict(self.encoder)

    def _tokenize(self, text: str) -> list:
        """Split *text* into tokens on whitespace."""
        return text.split()

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id, falling back to the unknown-token id."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id back to its token, falling back to the unknown token."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: list) -> str:
        """Join tokens with single spaces (inverse of ``_tokenize``)."""
        return " ".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary to ``vocab.json`` in *save_directory*.

        Args:
            save_directory: Directory the vocab file is written into.
            filename_prefix: Optional prefix for the filename, as passed by
                ``PreTrainedTokenizer.save_pretrained`` (added for
                compatibility with the base-class signature; default keeps
                the old ``vocab.json`` name).

        Returns:
            A 1-tuple with the path of the written vocab file.
        """
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.encoder, f, ensure_ascii=False)
        return (vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Add BOS/EOS special tokens around the input id sequence(s).

        Single sequence: ``<bos> A <eos>``.
        Pair of sequences: ``<bos> A <eos> B <eos>`` — the original
        implementation silently dropped ``token_ids_1``, losing the second
        sequence for pair inputs; it is now appended with its own EOS.
        """
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        if token_ids_1 is None:
            return bos + token_ids_0 + eos
        return bos + token_ids_0 + eos + token_ids_1 + eos