psinger committed on
Commit
49fb434
1 Parent(s): a18ce2b

Tokenization updates

Browse files

Some more suggestions for updates

Files changed (1) hide show
  1. tokenization_xgen.py +7 -6
tokenization_xgen.py CHANGED
@@ -115,7 +115,6 @@ class XgenTokenizer(PreTrainedTokenizer):
115
  def __init__(
116
  self,
117
  pad_token=None,
118
- eos_token="<|endoftext|>",
119
  add_eos_token=False,
120
  add_special_tokens=True,
121
  **kwargs,
@@ -149,20 +148,22 @@ class XgenTokenizer(PreTrainedTokenizer):
149
  def _convert_token_to_id(self, token):
150
  """Converts a token (str) in an id using the vocab."""
151
  if isinstance(token, str):
152
- ids = self._tokenize(token)
153
- return ids[0]
154
- return token
155
 
156
  def _convert_id_to_token(self, index):
157
  """Converts an index (integer) in a token (str) using the vocab."""
158
  return self.encoder.decode_single_token_bytes(index)
159
 
160
  def _decode(self, token_ids: List[int], skip_special_tokens: bool = False, **kwargs):
 
 
161
  return self.encoder.decode(token_ids)
162
 
163
  def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
164
  """Build model inputs from a sequence by appending eos_token_id."""
165
- eos_token_id = [50256] if self.add_eos_token else []
166
 
167
  output = token_ids_0 + eos_token_id
168
 
@@ -218,7 +219,7 @@ class XgenTokenizer(PreTrainedTokenizer):
218
  Returns:
219
  `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
220
  """
221
- eos_token_id = [50256] if self.add_eos_token else []
222
 
223
  output = [0] * len(token_ids_0 + eos_token_id)
224
 
 
115
  def __init__(
116
  self,
117
  pad_token=None,
 
118
  add_eos_token=False,
119
  add_special_tokens=True,
120
  **kwargs,
 
148
  def _convert_token_to_id(self, token):
149
  """Converts a token (str) in an id using the vocab."""
150
  if isinstance(token, str):
151
+ return self.encoder.encode_single_token(token)
152
+ else:
153
+ return token
154
 
155
  def _convert_id_to_token(self, index):
156
  """Converts an index (integer) in a token (str) using the vocab."""
157
  return self.encoder.decode_single_token_bytes(index)
158
 
159
  def _decode(self, token_ids: List[int], skip_special_tokens: bool = False, **kwargs):
160
+ if skip_special_tokens:
161
+ token_ids = [t for t in token_ids if t not in self.all_special_ids]
162
  return self.encoder.decode(token_ids)
163
 
164
  def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
165
  """Build model inputs from a sequence by appending eos_token_id."""
166
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
167
 
168
  output = token_ids_0 + eos_token_id
169
 
 
219
  Returns:
220
  `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
221
  """
222
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
223
 
224
  output = [0] * len(token_ids_0 + eos_token_id)
225