everdoubling committed
Commit: 19e0bc2
Parent(s): 5ce2421
Create tokenizer.py
Files changed: tokenizer.py (+286, -0)
tokenizer.py
ADDED
@@ -0,0 +1,286 @@
# coding=utf-8
#
# Everdoubling LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The following code is modified from HuggingFace's ByT5 Tokenizer: transformers/models/byt5/tokenization_byt5.py
#
"""Tokenization class for model ByT5Korean."""


import warnings
from typing import Dict, List, Optional, Tuple, Union

from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.models.byt5.tokenization_byt5 import ByT5Tokenizer


class ByT5KoreanTokenizer(PreTrainedTokenizer):
    """
    Construct a ByT5Korean tokenizer.

    On top of ByT5's simple raw-bytes UTF-8 encoding, ByT5Korean adds extra tokens for Korean jamo.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer`, which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (:obj:`int`, `optional`, defaults to 57):
            Add a number of extra ids to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the
            vocabulary, as in ByT5 preprocessing; see `here
            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
        additional_special_tokens (:obj:`List[str]`, `optional`):
            Additional special tokens used by the tokenizer.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=57,
        additional_special_tokens=None,
        **kwargs
    ) -> None:
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None:
            # Check that we have the right number of extra_id special tokens
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are provided to ByT5KoreanTokenizer. "
                    "In this case the additional_special_tokens must include the extra_ids tokens"
                )

        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        self._extra_ids = extra_ids

        # Add the special tokens (including extra_ids) to the trie used to split on special tokens
        for token in self.all_special_tokens:
            self.tokens_trie.add(token)

        self._utf_vocab_size = 2 ** 8  # one token per possible UTF-8 byte value
        self._utf_vocab_size += 19 + 21 + 28  # Korean jamo: 19 leads, 21 vowels, 28 tails (including "no tail")

        # define special tokens dict (token string -> id)
        self.special_tokens_encoder: Dict[str, int] = {
            self.pad_token: 0,
            self.eos_token: 1,
            self.unk_token: 2,
        }
        self._num_special_tokens = len(self.special_tokens_encoder)
        n = len(additional_special_tokens)
        for i, token in enumerate(additional_special_tokens):
            self.special_tokens_encoder[token] = self.vocab_size + i - n
        self.special_tokens_decoder: Dict[int, str] = {v: k for k, v in self.special_tokens_encoder.items()}

    @property
    def vocab_size(self):
        return self._utf_vocab_size + self._num_special_tokens + self._extra_ids
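    # Vocabulary layout implied by the definitions above: ids 0-2 are pad/eos/unk,
    # ids 3-258 cover the 256 raw byte values, ids 259-326 cover the 68 Korean jamo
    # tokens (19 leads, 21 vowels, 28 tails), and ids 327-383 are <extra_id_0> ...
    # <extra_id_56>, giving vocab_size == 384.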

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # normal case: some special tokens
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: ``X </s>``
        - pair of sequences: ``A </s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1

    def _convert_char_to_tokens_Korean(self, c):
        o = ord(c)
        if 44032 <= o and o <= 55203:  # 44032: 가, 55203: 힣
            o -= 44032
            return [chr(256 + (o // 588)), chr(256 + 19 + ((o % 588) // 28)), chr(256 + 19 + 21 + (o % 28))]
        return [chr(i) for i in c.encode("utf-8")]
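    # Worked example: '한' is U+D55C (54620), so o = 54620 - 44032 = 10588;
    # lead = 10588 // 588 = 18 (ㅎ), vowel = (10588 % 588) // 28 = 0 (ㅏ),
    # tail = 10588 % 28 = 4 (ㄴ), and the method returns
    # [chr(274), chr(275), chr(300)].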

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words."""
        if text in self.all_special_tokens:
            return [text]
        return sum([self._convert_char_to_tokens_Korean(c) for c in text], [])

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        if token in self.special_tokens_encoder:
            token_id = self.special_tokens_encoder[token]
        elif token in self.added_tokens_encoder:
            token_id = self.added_tokens_encoder[token]
        elif len(token) != 1:
            token_id = self.unk_token_id
        else:
            token_id = ord(token) + self._num_special_tokens
        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.special_tokens_decoder:
            token = self.special_tokens_decoder[index]
        else:
            token = chr(index - self._num_special_tokens)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        bstring = b""
        ids = [ord(t[0]) for t in tokens]
        # Recombine lead/vowel/tail jamo triples back into precomposed Hangul syllables
        for i in range(len(ids) - 2):
            if 256 <= ids[i] and ids[i] < 256 + 19 and 256 + 19 <= ids[i + 1] and ids[i + 1] < 256 + 19 + 21 and 256 + 19 + 21 <= ids[i + 2] and ids[i + 2] < 256 + 19 + 21 + 28:
                tokens[i] = chr(44032 + (ids[i] - 256) * 21 * 28 + (ids[i + 1] - 256 - 19) * 28 + (ids[i + 2] - 256 - 19 - 21))
                tokens[i + 1] = None
                tokens[i + 2] = None
        for token in tokens:
            if token is None:
                continue
            if token in self.special_tokens_decoder:
                tok_string = self.special_tokens_decoder[token].encode("utf-8")
            elif token in self.added_tokens_decoder:
                tok_string = self.added_tokens_decoder[token].encode("utf-8")
            elif token in self.special_tokens_encoder:
                tok_string = token.encode("utf-8")
            elif token in self.added_tokens_encoder:
                tok_string = token.encode("utf-8")
            else:
                if isinstance(token, str) and ord(token) >= 256:
                    tok_string = token.encode("utf-8")
                else:
                    tok_string = bytes([ord(token) if isinstance(token, str) else min(255, token)])
            bstring += tok_string
        string = bstring.decode("utf-8", errors="ignore")
        return string
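    # Worked example (inverse of the decomposition above): the jamo triple
    # [chr(274), chr(275), chr(300)] has ids [274, 275, 300], so the first loop
    # rewrites it to chr(44032 + 18*588 + 0*28 + 4) = chr(54620) = '한'.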

    # ByT5KoreanTokenizer has no vocab file
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        return ()


if __name__ == "__main__":
    tokenizer = ByT5KoreanTokenizer()
    text = "This is a test <extra_id_0> of the 가나힣 안녕하세요 <extra_id_1>."
    tokenized_text = tokenizer.tokenize(text)
    print(tokenized_text)
    print(tokenizer(text))
    print(tokenizer.convert_tokens_to_ids(tokenized_text))
    print(tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(tokenized_text)))
    print(tokenizer.convert_tokens_to_string(tokenized_text))
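    # Round-trip sketch: PreTrainedTokenizer.decode() routes ids through
    # convert_ids_to_tokens() and convert_tokens_to_string(), so the jamo
    # triples should come back as precomposed Hangul syllables.
    print(tokenizer.decode(tokenizer(text)["input_ids"]))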