rajammanabrolu committed on
Commit
2f824cd
1 Parent(s): 4c3f6b9

Upload tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|im_end|>": 100279,
3
+ "<|im_start|>": 100278,
4
+ "<|pad|>": 100277
5
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": "<|pad|>",
21
+ "unk_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ }
28
+ }
tiktoken.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import warnings
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
+
7
+ import torch
8
+ from transformers import PreTrainedTokenizer
9
+
10
# System message substituted into the default ChatML template when
# `use_default_system_prompt` is enabled on the tokenizer.
DEFAULT_SYSTEM_PROMPT = (
    'You are a helpful, respectful and honest assistant. '
    'Always answer as helpfully as possible.'
)
11
+
12
+
13
class TiktokenTokenizerWrapper(PreTrainedTokenizer):
    """A thin wrapper around tiktoken to make it compatible with Hugging Face tokenizers.

    See HuggingFace for further documentation on general tokenizer methods.
    """

    model_input_names = ['input_ids', 'attention_mask']

    def __init__(self,
                 model_name: Optional[str] = None,
                 encoding_name: Optional[str] = None,
                 add_bos_token: bool = False,
                 add_eos_token: bool = False,
                 use_default_system_prompt: bool = False,
                 unk_token: Optional[str] = '<|endoftext|>',
                 eos_token: Optional[str] = '<|endoftext|>',
                 bos_token: Optional[str] = '<|endoftext|>',
                 pad_token: Optional[str] = None,
                 **kwargs: Any):
        """Constructor creates a tiktoken tokenizer to use as the underlying tokenizer.

        Args:
            model_name (Optional[str], optional): The name of the model to load from tiktoken. Defaults to None.
                Either model_name or encoding_name must be set, but not both.
            encoding_name (Optional[str], optional): The name of the encoding to load from tiktoken. Defaults to None.
                Either model_name or encoding_name must be set, but not both.
            add_bos_token (bool, optional): Whether to add bos tokens. Defaults to False.
            add_eos_token (bool, optional): Whether to add eos tokens. Defaults to False.
            use_default_system_prompt (bool, optional): Use the default system prompt or not. Defaults to False.
            unk_token (Optional[str], optional): The unk token. Defaults to '<|endoftext|>'.
            eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'.
            bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'.
            pad_token (Optional[str], optional): The pad token. Defaults to None.
            **kwargs (Any): Forwarded unchanged to the base class constructor.

        Raises:
            ImportError: If tiktoken is not installed.
            ValueError: If both or neither of model_name and encoding_name
                are provided.
        """
        try:
            import tiktoken
        except ImportError as e:
            # Only a failed import is translated; anything else (e.g.
            # KeyboardInterrupt) must propagate untouched.
            raise ImportError(
                'You need to install tiktoken to use TiktokenTokenizerWrapper.'
            ) from e

        # Workaround to make tiktokenizer picklable.
        # https://github.com/huggingface/datasets/issues/5536#issuecomment-1682309347
        # There is an open PR from HF to add this to tiktoken: https://github.com/openai/tiktoken/pull/181
        import copyreg
        import functools

        from tiktoken import Encoding  # type: ignore (thirdParty)

        def pickle_Encoding(enc: Encoding):
            # Rebuild the Encoding from its constructor arguments on unpickle.
            return (functools.partial(Encoding,
                                      enc.name,
                                      pat_str=enc._pat_str,
                                      mergeable_ranks=enc._mergeable_ranks,
                                      special_tokens=enc._special_tokens), ())

        copyreg.pickle(Encoding, pickle_Encoding)

        if model_name is not None and encoding_name is not None:
            raise ValueError(
                'You need to specify either model_name or encoding_name, not both.'
            )

        self.model_name = model_name
        self.encoding_name = encoding_name

        if self.model_name is not None:
            self.encoding = tiktoken.encoding_for_model(  # type: ignore (thirdParty)
                self.model_name)
        elif self.encoding_name is not None:
            self.encoding = tiktoken.get_encoding(  # type: ignore (thirdParty)
                self.encoding_name)
        else:
            raise ValueError(
                'You need to specify either model_name or encoding_name.')

        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt

        # The attributes above are assigned before delegating to the base
        # class; get_vocab() notes that it is called during base-class init
        # and it reads self.encoding.
        super().__init__(model_name=model_name,
                         encoding_name=encoding_name,
                         add_bos_token=add_bos_token,
                         add_eos_token=add_eos_token,
                         use_default_system_prompt=use_default_system_prompt,
                         unk_token=unk_token,
                         eos_token=eos_token,
                         bos_token=bos_token,
                         pad_token=pad_token,
                         **kwargs)

    @property
    def vocab_size(self) -> int:
        """Returns vocab size (the `n_vocab` of the tiktoken encoding)."""
        return self.encoding.n_vocab

    @property
    def is_fast(self) -> bool:
        """Always returns False."""
        return False

    @property
    def default_chat_template(self):
        """Chat ML Template for User/Assistant.

        Pinning default Chat ML template in case defaults change.

        Returns:
            str: The Jinja template, with the USE_DEFAULT_PROMPT and
            DEFAULT_SYSTEM_PROMPT placeholders substituted.
        """
        # NOTE(review): the default system turn below is emitted without a
        # closing '<|im_end|>' + newline, unlike every message in the loop.
        # Looks unintended, but changing it alters prompts seen by the model;
        # confirm before fixing.
        template = (
            "{% set system_message = '' %}"
            '{% if USE_DEFAULT_PROMPT == true %}'
            "{{'<|im_start|>system\n' + 'DEFAULT_SYSTEM_PROMPT'}}"
            '{% endif %}'
            '{% for message in messages %}'
            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
            '{% endfor %}')
        template = template.replace(
            'USE_DEFAULT_PROMPT',
            'true' if self.use_default_system_prompt else 'false')
        template = template.replace('DEFAULT_SYSTEM_PROMPT',
                                    DEFAULT_SYSTEM_PROMPT)
        return template

    def get_vocab(self) -> Dict[str, int]:
        """Returns vocab as a dict.

        Note: This function does not work properly due to difference in assumptions between tiktoken and Hugging Face tokenizers.
        Most uses do not need to use get_vocab, so this is not a priority to fix.

        Returns:
            Dict[str, int]: A mapping with exactly `vocab_size` entries. Ids
            that could not be stored under their decoded string are stored
            under placeholder keys of the form '<extra_id_N>'.
        """
        warnings.warn(
            'get_vocab does not work properly with TiktokenTokenizerWrapper. Please do not rely on it being perfectly correct.'
            +
            ' It will be called once init just to get the size of the vocab inside the base class.'
        )

        vocab = {}
        for i in range(self.vocab_size):
            try:
                # need to try this first, so that we get a proper KeyError,
                # otherwise it crashes in the rust code
                _ = self.encoding.decode_single_token_bytes(i)
                vocab[self.encoding.decode([i])] = i
            except KeyError:
                pass

        # As far as I can tell, we don't require get_vocab to completely work,
        # but when using additional_special_tokens, Hugging Face determines the next
        # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct.
        extra_id_index = 0
        candidate_extra_id = f'<extra_id_{extra_id_index}>'
        indices_to_fill_in = set(range(self.vocab_size)) - set(vocab.values())

        # Add enough indices to make get_vocab() the right length.
        # Sorted so the placeholder -> index assignment is deterministic.
        for index_to_add in sorted(indices_to_fill_in):
            # Make sure we don't overwrite a token that already exists
            while candidate_extra_id in vocab:
                extra_id_index += 1
                candidate_extra_id = f'<extra_id_{extra_id_index}>'

            # Get an index to add and add the item
            vocab[candidate_extra_id] = index_to_add

        return vocab

    def _tokenize(self, text: str) -> List[int]:
        """Returns a tokenized string.

        Note: We have slightly redefined the expected contract between this method and
        the _convert_token_to_id method. Normally, this method turns a string, into a list of strings,
        and then the _convert_token_to_id method turns that list of strings into a list of integers.
        However, not all vocab indices can be decoded into a string, so instead we just return the integers
        from this function, and have adjusted the _convert_token_to_id method to handle integers as well as strings.
        The only use of _tokenize that I could find was in this way, so this _should_ be safe.

        Args:
            text (str): The text to encode.

        Returns:
            List[int]: The token ids produced by the tiktoken encoding, with
            all special tokens allowed.

        Raises:
            ValueError: If text is not a string.
        """
        if not isinstance(text, str):
            raise ValueError(
                f'Expected a string input to _tokenize but got {type(text)}.')

        return list(self.encoding.encode(text, allowed_special='all'))

    def _convert_token_to_id(self, token: Union[int, str]) -> int:
        """Converts a token (str) into an id using the vocab.

        Integers are returned unchanged (see the contract note on _tokenize).
        For strings, only the first id of the encoded token is returned.
        """
        if isinstance(token, int):
            return token

        return self.encoding.encode(token, allowed_special='all')[0]

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.encoding.decode([index])

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Converts a sequence of tokens (string) in a single string."""
        return ''.join(tokens)

    def convert_ids_to_tokens(
            self,
            ids: Union[int, List[int]],
            skip_special_tokens: bool = False) -> Union[str, List[str]]:
        """Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary and added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s). For a list input, runs
            of consecutive regular ids are decoded together into one string,
            so the output may be shorter than the input.
        """
        if isinstance(ids, int):
            if ids in self.added_tokens_decoder:
                return str(self.added_tokens_decoder[ids])

            return self._convert_id_to_token(ids)

        # Read the property once instead of on every loop iteration, and only
        # when it is actually needed.
        special_ids = self.all_special_ids if skip_special_tokens else []

        # current_stream will collect multiple tokens, and then separately add items
        # for each added token. This is done so that decode works properly with token ids
        # that cannot be represented naively in utf-8.
        tokens = []
        current_stream = []
        for index in ids:
            if skip_special_tokens and index in special_ids:
                continue

            if index in self.added_tokens_decoder:
                # Flush only a non-empty stream; decoding an empty one would
                # insert a spurious '' token before the added token.
                if current_stream:
                    tokens.append(self.encoding.decode(current_stream))
                    current_stream = []
                tokens.append(str(self.added_tokens_decoder[index]))
            else:
                current_stream.append(index)

        if current_stream:
            tokens.append(self.encoding.decode(current_stream))
        return tokens

    def build_inputs_with_special_tokens(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """Wraps one or two id sequences with bos/eos ids as configured.

        Args:
            token_ids_0 (`List[int]`): First sequence of ids.
            token_ids_1 (`List[int]`, *optional*): Second sequence of ids.

        Returns:
            `List[int]`: Each provided sequence, prefixed with the bos id when
            `add_bos_token` is set and suffixed with the eos id when
            `add_eos_token` is set, concatenated in order.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None,
            already_has_special_tokens: bool = False) -> List[int]:
        """Retrieves sequence ids from a token list that has no special tokens added.

        This method is called when adding special tokens using the
        tokenizer `prepare_for_model` or `encode_plus` methods.

        Function copied from
        https://github.com/huggingface/transformers/blob/e3a4bd2bee212a2d0fd9f03b27fe7bfc1debe42d/src/transformers/models/gpt2/tokenization_gpt2.py#L265-L295

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True)

        # Mirrors build_inputs_with_special_tokens: a 1 wherever a bos/eos id
        # would be inserted.
        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
                bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)

    def create_token_type_ids_from_sequences(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """Builds token type ids: 0 for the first sequence, 1 for the second.

        Args:
            token_ids_0 (`List[int]`): First sequence of ids.
            token_ids_1 (`List[int]`, *optional*): Second sequence of ids.

        Returns:
            `List[int]`: `len(token_ids_0) + 1` zeros, followed by
            `len(token_ids_1) + 1` ones when a second sequence is given.
        """
        # NOTE(review): one separator position is counted per sequence here,
        # but build_inputs_with_special_tokens never inserts a separator, so
        # the lengths can disagree with the built inputs - confirm intent.
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0]
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self,
                        save_directory: str,
                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Does not write any vocabulary file; returns `(None, None)`.

        Args:
            save_directory (str): Unused.
            filename_prefix (Optional[str], optional): Unused.
        """
        # ignore the below type to keep the original signature
        # we are knowingly breaking the signature here, although not 100% certain
        # it doesn't have side effects
        # There is some code in huggingface that calls this function to get the vocab files,
        # but it doesn't seem to access them (or at least checks for their existence
        # before accessing them)
        return (None, None)  # type: ignore

    def sanitize_special_tokens(self) -> int:
        """Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`, `tokenizer.cls_token`, etc.) are in the vocabulary.

        Add the missing ones to the vocabulary if needed.

        Return:
            `int`: The number of tokens added in the vocabulary during the operation.
        """
        # A special token that encodes to more than one id is treated as not
        # yet present as a single token, so it is (re-)added.
        actual_new_tokens = []
        for token in self.all_special_tokens_extended:
            encoded = self.encoding.encode(token, allowed_special='all')
            if len(encoded) > 1:
                actual_new_tokens.append(token)

        return self.add_tokens(actual_new_tokens, special_tokens=True)

    def construct_logit_tensor(self, logprobs: Dict[str,
                                                    float]) -> torch.Tensor:
        """Construct tensor of shape (vocab_size,) mapping words to logprobs.

        Every position is initialised to `min(logprobs.values()) - 1`; the
        position of the first id of each key's encoding is then set to that
        key's logprob.

        Args:
            logprobs (Dict[str, float]): Dictionary mapping tokens to log probabilities assigned to them by the model.

        Returns:
            torch.Tensor: Tensor of length `vocab_size`.

        Raises:
            ValueError: If logprobs is empty.
        """
        if not logprobs:
            # min() of an empty sequence would raise an opaque ValueError.
            raise ValueError(
                'logprobs must contain at least one entry to construct a logit tensor.'
            )

        tensor = torch.tensor([min(logprobs.values()) - 1] * (self.vocab_size))
        for token, logprob in logprobs.items():
            # NOTE(review): this goes through the full tokenizer call;
            # presumably a bos id would be the first id when add_bos_token is
            # enabled - confirm against callers.
            encoding = self(token)['input_ids']
            idx = encoding[0]
            tensor[idx] = logprob
        return tensor


TiktokenTokenizerWrapper.register_for_auto_class()
tokenizer_config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "100257": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "100258": {
15
+ "content": "<|fim_prefix|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "100259": {
23
+ "content": "<|fim_middle|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "100260": {
31
+ "content": "<|fim_suffix|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "100276": {
39
+ "content": "<|endofprompt|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "100277": {
47
+ "content": "<|pad|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "100278": {
55
+ "content": "<|im_start|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "100279": {
63
+ "content": "<|im_end|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ }
70
+ },
71
+ "additional_special_tokens": [
72
+ "<|im_start|>",
73
+ "<|im_end|>"
74
+ ],
75
+ "auto_map": {
76
+ "AutoTokenizer": [
77
+ "tiktoken.TiktokenTokenizerWrapper",
78
+ null
79
+ ]
80
+ },
81
+ "bos_token": "<|endoftext|>",
82
+ "clean_up_tokenization_spaces": true,
83
+ "encoding_name": null,
84
+ "eos_token": "<|endoftext|>",
85
+ "model_max_length": 8192,
86
+ "model_name": "gpt-4",
87
+ "pad_token": "<|pad|>",
88
+ "tokenizer_class": "TiktokenTokenizerWrapper",
89
+ "unk_token": "<|endoftext|>",
90
+ "use_default_system_prompt": false
91
+ }