rajammanabrolu committed on
Commit
225a8dc
·
1 Parent(s): 5db0638

Update tiktoken.py

Browse files
Files changed (1) hide show
  1. tiktoken.py +78 -89
tiktoken.py CHANGED
@@ -1,8 +1,7 @@
1
  # Copyright 2022 MosaicML LLM Foundry authors
2
  # SPDX-License-Identifier: Apache-2.0
3
-
4
- import warnings
5
- from typing import Any, Dict, List, Optional, Tuple, Union
6
 
7
  import torch
8
  from transformers import PreTrainedTokenizer
@@ -10,6 +9,38 @@ from transformers import PreTrainedTokenizer
10
  DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  class TiktokenTokenizerWrapper(PreTrainedTokenizer):
14
  """A thin wrapper around tiktoken to make it compatible with Hugging Face.
15
 
@@ -93,6 +124,28 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
93
  self.add_eos_token = add_eos_token
94
  self.use_default_system_prompt = use_default_system_prompt
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  super().__init__(model_name=model_name,
97
  encoding_name=encoding_name,
98
  add_bos_token=add_bos_token,
@@ -140,117 +193,53 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
140
  Note: This function does not work properly due to difference in assumptions between tiktoken and Hugging Face tokenizers.
141
  Most uses do not need to use get_vocab, so this is not a priority to fix.
142
  """
143
- warnings.warn(
144
- 'get_vocab does not work properly with TiktokenTokenizerWrapper. Please do not rely on it being perfectly correct.'
145
- +
146
- ' It will be called once init just to get the size of the vocab inside the base class.'
147
- )
148
-
149
- vocab = {}
150
- for i in range(self.vocab_size):
151
- try:
152
- # need to try this first, so that we get a proper KeyError,
153
- # otherwise it crashes in the rust code
154
- _ = self.encoding.decode_single_token_bytes(i)
155
- vocab[self.encoding.decode([i])] = i
156
- except KeyError:
157
- pass
158
-
159
  # As far as I can tell, we don't require get_vocab to completely work,
160
  # but when using additional_special_tokens, Hugging Face determines the next
161
  # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct.
 
162
  extra_id_index = 0
163
  candidate_extra_id = f'<extra_id_{extra_id_index}>'
164
  indices_to_fill_in = {i for i in range(self.vocab_size)} - set(
165
- vocab.values())
166
 
167
  # Add enough indices to make get_vocab() the right length
168
  for index_to_add in indices_to_fill_in:
169
  # Make sure we don't overwrite a token that already exists
170
- while candidate_extra_id in vocab:
171
  extra_id_index += 1
172
  candidate_extra_id = f'<extra_id_{extra_id_index}>'
173
 
174
  # Get an index to add and add the item
175
- vocab[candidate_extra_id] = index_to_add
176
-
177
- return vocab
178
 
179
- def _tokenize(self, text: str) -> List[int]:
180
- """Returns a tokenized string.
181
 
182
- Note: We have slightly redefined the expected contract between this method and
183
- the _convert_token_to_id method. Normally, this method turns a string, into a list of strings,
184
- and then the _convert_token_to_id method turns that list of strings into a list of integers.
185
- However, not all vocab indices can be decoded into a string, so instead we just return the integers
186
- from this function, and have adjusted the _convert_token_to_id method to handle integers as well as strings.
187
- The only use of _tokenize that I could find was in this way, so this _should_ be safe.
188
- """
189
  if not isinstance(text, str):
190
  raise ValueError(
191
  f'Expected a string input to _tokenize but got {type(text)}.')
192
 
193
- tokens = [t for t in self.encoding.encode(text, allowed_special='all')]
 
 
 
194
 
195
  return tokens
196
 
197
- def _convert_token_to_id(self, token: Union[int, str]) -> int:
198
- """Converts a token (str) into an id using the vocab."""
199
- if isinstance(token, int):
200
- return token
201
-
202
- return self.encoding.encode(token, allowed_special='all')[0]
203
 
204
- def _convert_id_to_token(self, index: int) -> str:
205
- """Converts an index (integer) into a token (str) using the vocab."""
206
- return self.encoding.decode([index])
207
 
208
def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """Concatenate ``tokens`` into one string with no separator."""
    separator = ''
    return separator.join(tokens)
211
-
212
def convert_ids_to_tokens(
        self,
        ids: Union[int, List[int]],
        skip_special_tokens: bool = False) -> Union[str, List[str]]:
    """Turn one id, or a sequence of ids, into token string(s).

    Ids found in ``self.added_tokens_decoder`` are rendered with ``str()``
    of the stored entry. Runs of all other ids are decoded together through
    ``self.encoding.decode`` so that ids which are not valid UTF-8 on their
    own still decode properly as a group.

    Args:
        ids (`int` or `List[int]`):
            The token id (or token ids) to convert to tokens.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether to drop ids listed in ``self.all_special_ids``. Only
            consulted when ``ids`` is a sequence.

    Returns:
        `str` or `List[str]`: The decoded token(s). For a sequence input,
        each list item is either an added token or the decoding of the run
        of regular ids that preceded it (possibly an empty run).
    """
    # Single-id path.
    if isinstance(ids, int):
        if ids not in self.added_tokens_decoder:
            return self._convert_id_to_token(ids)
        return str(self.added_tokens_decoder[ids])

    pieces = []
    pending = []  # regular ids waiting to be decoded as one group
    for token_id in ids:
        if skip_special_tokens and token_id in self.all_special_ids:
            continue

        if token_id not in self.added_tokens_decoder:
            pending.append(token_id)
            continue

        # Flush the run collected so far (even if empty), then emit the
        # added token as its own item.
        pieces.append(self.encoding.decode(pending))
        pending = []
        pieces.append(str(self.added_tokens_decoder[token_id]))

    if pending:
        pieces.append(self.encoding.decode(pending))
    return pieces
254
 
255
  def build_inputs_with_special_tokens(
256
  self,
@@ -360,4 +349,4 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
360
  return tensor
361
 
362
 
363
- TiktokenTokenizerWrapper.register_for_auto_class()
 
1
  # Copyright 2022 MosaicML LLM Foundry authors
2
  # SPDX-License-Identifier: Apache-2.0
3
+ from functools import lru_cache
4
+ from typing import Any, Dict, List, Optional, Tuple
 
5
 
6
  import torch
7
  from transformers import PreTrainedTokenizer
 
9
  DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
10
 
11
 
12
+ # Taken from
13
+ # https://github.com/huggingface/transformers/blob/8aca43bdb3cb9a5020f6d57589d85679dc873b1c/src/transformers/models/gpt2/tokenization_gpt2.py#L62-L84
14
@lru_cache()
def bytes_to_unicode():
    """Build the byte -> single-character lookup table used for byte-level BPE.

    Every one of the 256 byte values is paired with a distinct one-character
    string. Bytes in the ranges ``'!'..'~'``, ``'¡'..'¬'`` and ``'®'..'ÿ'``
    map to the character with the same code point. All remaining bytes
    (whitespace, control characters and the like, which the bpe code barfs
    on) are assigned code points 256, 257, ... in ascending byte order.

    The reversible bpe codes work on unicode strings, so without such a table
    a large number of unicode characters would be needed in the vocab to
    avoid UNKs.

    The result is memoized by ``lru_cache``, so every call returns the same
    dict object.

    Returns:
        A dict from byte value (``int``) to its stand-in character (``str``).
    """
    self_mapped = [
        *range(ord('!'), ord('~') + 1),
        *range(ord('¡'), ord('¬') + 1),
        *range(ord('®'), ord('ÿ') + 1),
    ]
    byte_values = list(self_mapped)
    code_points = list(self_mapped)

    shifted = 0
    for byte in range(256):
        if byte in self_mapped:
            continue
        # Not directly representable: park it above the single-byte range.
        byte_values.append(byte)
        code_points.append(256 + shifted)
        shifted += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
42
+
43
+
44
  class TiktokenTokenizerWrapper(PreTrainedTokenizer):
45
  """A thin wrapper around tiktoken to make it compatible with Hugging Face.
46
 
 
124
  self.add_eos_token = add_eos_token
125
  self.use_default_system_prompt = use_default_system_prompt
126
 
127
+ self.byte_encoder = bytes_to_unicode()
128
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
129
+
130
+ self.decoder = {}
131
+ for i in range(self.encoding.n_vocab):
132
+ try:
133
+ self.encoding.decode_single_token_bytes(i)
134
+ except KeyError:
135
+ continue
136
+ # Taken from
137
+ # https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
138
+ decoding = ''.join([
139
+ bytes_to_unicode()[ord(char)] for char in
140
+ self.encoding.decode_single_token_bytes(i).decode('latin-1')
141
+ ])
142
+ self.decoder[i] = decoding
143
+
144
+ self.encoder = {}
145
+ for i in range(self.encoding.n_vocab):
146
+ if i in self.decoder:
147
+ self.encoder[self.decoder[i]] = i
148
+
149
  super().__init__(model_name=model_name,
150
  encoding_name=encoding_name,
151
  add_bos_token=add_bos_token,
 
193
  Note: This function does not work properly due to difference in assumptions between tiktoken and Hugging Face tokenizers.
194
  Most uses do not need to use get_vocab, so this is not a priority to fix.
195
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  # As far as I can tell, we don't require get_vocab to completely work,
197
  # but when using additional_special_tokens, Hugging Face determines the next
198
  # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct.
199
+ vocab_clone = self.encoder.copy()
200
  extra_id_index = 0
201
  candidate_extra_id = f'<extra_id_{extra_id_index}>'
202
  indices_to_fill_in = {i for i in range(self.vocab_size)} - set(
203
+ vocab_clone.values())
204
 
205
  # Add enough indices to make get_vocab() the right length
206
  for index_to_add in indices_to_fill_in:
207
  # Make sure we don't overwrite a token that already exists
208
+ while candidate_extra_id in vocab_clone:
209
  extra_id_index += 1
210
  candidate_extra_id = f'<extra_id_{extra_id_index}>'
211
 
212
  # Get an index to add and add the item
213
+ vocab_clone[candidate_extra_id] = index_to_add
 
 
214
 
215
+ return vocab_clone
 
216
 
217
+ def _tokenize(self, text: str) -> List[str]:
218
+ """Returns a tokenized string."""
 
 
 
 
 
219
  if not isinstance(text, str):
220
  raise ValueError(
221
  f'Expected a string input to _tokenize but got {type(text)}.')
222
 
223
+ tokens = [
224
+ self.decoder[t]
225
+ for t in self.encoding.encode(text, allowed_special='all')
226
+ ]
227
 
228
  return tokens
229
 
230
+ def _convert_token_to_id(self, token: str):
231
+ """Converts a token (str) in an id using the vocab."""
232
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
 
 
 
233
 
234
+ def _convert_id_to_token(self, index: int):
235
+ """Converts an index (integer) in a token (str) using the vocab."""
236
+ return self.decoder.get(index)
237
 
238
def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """Converts a sequence of tokens (string) into a single string.

    The tokens are concatenated, every character is mapped back to its raw
    byte value through ``self.byte_decoder``, and the resulting bytes are
    decoded as UTF-8.

    Args:
        tokens: Token strings whose characters are all keys of
            ``self.byte_decoder``.

    Returns:
        The decoded text. Byte sequences that are not valid UTF-8 are
        rendered as U+FFFD (the replacement character) instead of raising.

    Raises:
        KeyError: If a character of ``tokens`` is not a key of
            ``self.byte_decoder``.
    """
    raw_bytes = bytearray(self.byte_decoder[char] for char in ''.join(tokens))
    # A token boundary can fall in the middle of a multi-byte UTF-8 character,
    # so a partial token sequence is not necessarily valid UTF-8 on its own.
    # Decode leniently rather than crash with UnicodeDecodeError.
    return raw_bytes.decode('utf-8', errors='replace')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
 
244
  def build_inputs_with_special_tokens(
245
  self,
 
349
  return tensor
350
 
351
 
352
+ TiktokenTokenizerWrapper.register_for_auto_class()