jon-tow committed on
Commit
33c3426
·
verified ·
1 Parent(s): 8820aad

feat(tokenizer): expose merge ranks and special tokens for GGUF

Browse files
Files changed (1) hide show
  1. tokenization_arcade100k.py +5 -2
tokenization_arcade100k.py CHANGED
@@ -113,7 +113,7 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
113
  super().__init__(errors=errors, **kwargs)
114
  self._tiktoken_config = _arcade100k(vocab_file)
115
  self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
116
- self.errors = errors
117
  # TODO: Remove this assertion
118
  assert (
119
  len(self.tokenizer._mergeable_ranks)
@@ -126,6 +126,9 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
126
  self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
127
  self.eos_token = self.decoder[self.tokenizer.eot_token]
128
  self.pad_token = self.decoder[self.tokenizer.eot_token]
 
 
 
129
 
130
  def __len__(self):
131
  return self.tokenizer.n_vocab
@@ -270,4 +273,4 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
270
  token_ids = [token_ids]
271
  if skip_special_tokens:
272
  token_ids = [i for i in token_ids if i < self.tokenizer.eot_token]
273
- return self.tokenizer.decode(token_ids)
 
113
  super().__init__(errors=errors, **kwargs)
114
  self._tiktoken_config = _arcade100k(vocab_file)
115
  self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
116
+
117
  # TODO: Remove this assertion
118
  assert (
119
  len(self.tokenizer._mergeable_ranks)
 
126
  self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
127
  self.eos_token = self.decoder[self.tokenizer.eot_token]
128
  self.pad_token = self.decoder[self.tokenizer.eot_token]
129
+ # Expose for convenience
130
+ self.mergeable_ranks = self.tokenizer._mergeable_ranks
131
+ self.special_tokens = self.tokenizer._special_tokens
132
 
133
  def __len__(self):
134
  return self.tokenizer.n_vocab
 
273
  token_ids = [token_ids]
274
  if skip_special_tokens:
275
  token_ids = [i for i in token_ids if i < self.tokenizer.eot_token]
276
+ return self.tokenizer.decode(token_ids)