shaojiang committed on
Commit
68d5b33
1 Parent(s): 452509d

Upload 8 files

Browse files
cache/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
pretrained/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "embd_pdrop": 0.1,
8
+ "gradient_checkpointing": false,
9
+ "initializer_range": 0.02,
10
+ "layer_norm_epsilon": 1e-05,
11
+ "model_type": "gpt2",
12
+ "n_ctx": 1024,
13
+ "n_embd": 768,
14
+ "n_head": 12,
15
+ "n_inner": null,
16
+ "n_layer": 12,
17
+ "n_positions": 1024,
18
+ "output_past": true,
19
+ "resid_pdrop": 0.1,
20
+ "task_specific_params": {
21
+ "text-generation": {
22
+ "do_sample": true,
23
+ "max_length": 400
24
+ }
25
+ },
26
+ "tokenizer_class": "BertTokenizer",
27
+ "vocab_size": 25370
28
+ }
pretrained/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec9e148d4b3e4bfcc5e35383448896ed685fa00968fe3515b7b1aef5812f9c5a
3
+ size 433952719
tokenizations/__pycache__/tokenization_bert.cpython-39.pyc ADDED
Binary file (15.3 kB). View file
 
tokenizations/bpe_tokenizer.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ from https://github.com/openai/gpt-2/, changed for chinese
3
+ """
4
+ import json
5
+ import os
6
+ import sentencepiece as spm
7
+ """
8
+ SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation
9
+ systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements
10
+ subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the
11
+ extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end
12
+ system that does not depend on language-specific pre/postprocessing.
13
+ https://github.com/google/sentencepiece
14
+
15
+ pip install sentencepiece
16
+
17
+ or git clone https://github.com/google/sentencepiece.git
18
+ python setup.py install
19
+
20
+ """
21
+
22
def get_pairs(word):
    """Return the set of adjacent symbol pairs found in ``word``.

    Args:
        word: A sliceable sequence of symbols, e.g. a tuple of single
            characters or of already-merged BPE pieces.

    Returns:
        A set of ``(left, right)`` tuples, one for every pair of neighbouring
        symbols. A sequence with fewer than two symbols has no neighbours, so
        the result is an empty set.
    """
    # zip() stops at the shorter argument, so empty and one-symbol inputs give
    # an empty set instead of failing on word[0].
    return set(zip(word, word[1:]))
29
+
30
+
31
class Encoder:
    """Byte-pair-encoding tokenizer built from a vocabulary and a merge list.

    The whole input string is treated as one token: it is split into
    characters and the ranked merges are applied until no known merge is left.
    Tokens missing from the vocabulary are mapped to id ``1``.
    """

    def __init__(self, encoder, bpe_merges):
        """Store the vocabulary and rank the merge rules.

        Args:
            encoder: Mapping from token string to integer id.
            bpe_merges: Ordered sequence of ``(first, second)`` pairs; a pair's
                position in the sequence is its rank (lower merges first).
        """
        self.encoder = encoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        # Memoizes bpe() results, keyed by the raw token.
        self.cache = {}
        self.max_len = 0

    def bpe(self, token):
        """Apply the merge rules to ``token``.

        Args:
            token: The string to segment.

        Returns:
            The resulting pieces joined by single spaces. Tokens shorter than
            two characters cannot be merged and are returned unchanged.
        """
        if token in self.cache:
            return self.cache[token]
        # Nothing to merge; this also keeps empty input from reaching
        # get_pairs(), which needs at least one symbol.
        if len(token) < 2:
            return token
        word = tuple(token)
        pairs = get_pairs(word)

        while True:
            # Pick the adjacent pair with the best (lowest) rank.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the tail and stop.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Return the vocabulary ids of the BPE pieces of ``text``."""
        return self.convert_tokens_to_ids(self.tokenize(text))

    def decode(self, tokens):
        """Concatenate the token strings of ``tokens`` (an iterable of ids).

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        return ''.join([self.decoder[token] for token in tokens])

    def tokenize(self, text):
        """Split ``text`` into BPE pieces; empty input yields an empty list."""
        if not text:
            return []
        return self.bpe(text).split(' ')

    def convert_tokens_to_ids(self, tokens):
        """Map token strings to ids, using id ``1`` for unknown tokens."""
        return [self.encoder.get(token, 1) for token in tokens]
93
+
94
class Encoder_SP:
    """Adapter exposing a SentencePiece model through the Encoder method names."""

    def __init__(self, model_path):
        """Create a SentencePiece processor and load the model at ``model_path``."""
        processor = spm.SentencePieceProcessor()
        processor.Load(model_path)
        self.sp = processor

    def encode(self, text):
        """Return the ids SentencePiece assigns to ``text`` (a string)."""
        ids = self.sp.EncodeAsIds(text)
        return ids

    def decode(self, tokens):
        """Decode ``tokens`` (``[x1, x2, ...]``) back into text.

        Every element is converted with ``int()`` before being handed to the
        processor.
        """
        as_ints = list(map(int, tokens))
        return self.sp.DecodeIds(as_ints)

    def tokenize(self, text):
        """Return the SentencePiece pieces of ``text``."""
        pieces = self.sp.EncodeAsPieces(text)
        return pieces

    def convert_tokens_to_ids(self, tokens):
        """Look up the id of every piece in ``tokens``."""
        piece_to_id = self.sp.PieceToId
        return [piece_to_id(piece) for piece in tokens]
120
+
121
def get_encoder(encoder_file, bpe_file):
    """Build a tokenizer from its on-disk files.

    Args:
        encoder_file: Path to a SentencePiece ``.model`` file, or to a JSON
            file holding the token-to-id vocabulary.
        bpe_file: Path to the BPE merges file, or ``""`` to select
            SentencePiece (only honoured when ``encoder_file`` ends in
            ``.model``).

    Returns:
        An ``Encoder_SP`` when a ``.model`` file is given without a merges
        file, otherwise an ``Encoder`` built from the vocabulary and merges.
    """
    # The branch below lets this single entry point also serve SentencePiece.
    extension = os.path.splitext(encoder_file)[1]
    if extension == ".model" and bpe_file == "":
        return Encoder_SP(encoder_file)

    with open(encoder_file, 'r', encoding="utf-8") as f:
        encoder = json.load(f)
    with open(bpe_file, 'r', encoding="utf-8") as f:
        bpe_data = f.read()

    # The first line is skipped, as before (header line). Blank lines are
    # dropped explicitly, so the last merge rule is kept even when the file
    # has no trailing newline.
    bpe_merges = []
    for merge_str in bpe_data.split('\n')[1:]:
        merge = tuple(merge_str.split())
        if merge:
            bpe_merges.append(merge)
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )
139
+
140
+
141
+
142
+
tokenizations/thulac_dict/seg ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [SEP]
2
+ [PAD]
3
+ [CLS]
4
+ [UNK]
5
+ [MASK]
tokenizations/tokenization_bert.py ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes."""
16
+
17
+ from __future__ import absolute_import, division, print_function, unicode_literals
18
+
19
+ import collections
20
+ import logging
21
+ import os
22
+ import unicodedata
23
+ from io import open
24
+
25
+ from transformers.tokenization_utils import PreTrainedTokenizer
26
+
27
logger = logging.getLogger(__name__)

# Name of the file that holds the vocabulary.
VOCAB_FILES_NAMES = dict(vocab_file='vocab.txt')

# Vocabulary download URL for every known checkpoint name.
PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file': {
        'bert-base-uncased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
        'bert-large-uncased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
        'bert-base-cased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
        'bert-large-cased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
        'bert-base-multilingual-uncased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
        'bert-base-multilingual-cased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
        'bert-base-chinese':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
        'bert-base-german-cased':
            "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
        'bert-large-uncased-whole-word-masking':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
        'bert-large-cased-whole-word-masking':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
        'bert-large-uncased-whole-word-masking-finetuned-squad':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
        'bert-large-cased-whole-word-masking-finetuned-squad':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
        'bert-base-cased-finetuned-mrpc':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
    },
}

# Every checkpoint listed above is registered with the same size, 512.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(
    PRETRAINED_VOCAB_FILES_MAP['vocab_file'], 512)
65
+
66
def load_vocab(vocab_file):
    """Read a vocabulary file and map each token to its line index.

    Args:
        vocab_file: Path to a UTF-8 text file with one token per line.

    Returns:
        An ``OrderedDict`` from token (line content without its trailing
        newline) to the zero-based index of the line it was read from. When a
        token appears on several lines the last index is kept.
    """
    with open(vocab_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
    return collections.OrderedDict(
        (line.rstrip('\n'), position) for position, line in enumerate(lines))
75
+
76
+
77
def whitespace_tokenize(text):
    """Trim ``text`` and split it on runs of whitespace.

    Returns:
        The list of whitespace-separated pieces, or an empty list when the
        trimmed text is empty.
    """
    trimmed = text.strip()
    return trimmed.split() if trimmed else []
84
+
85
+
86
class BertTokenizer(PreTrainedTokenizer):
    r"""
    Constructs a BertTokenizer.
    :class:`~pytorch_pretrained_bert.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        never_split: List of tokens which will never be split during tokenization. Only has an effect when
            do_basic_tokenize=True
        tokenize_chinese_chars: Passed on to ``BasicTokenizer``; only has an effect when do_basic_tokenize=True
        **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
                 mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs):
        """Constructs a BertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
            **do_lower_case**: (`optional`) boolean (default True)
                Whether to lower case the input
                Only has an effect when do_basic_tokenize=True
            **do_basic_tokenize**: (`optional`) boolean (default True)
                Whether to do basic tokenization before wordpiece.
            **never_split**: (`optional`) list of string
                List of tokens which will never be split during tokenization.
                Only has an effect when do_basic_tokenize=True
            **tokenize_chinese_chars**: (`optional`) boolean (default True)
                Whether to tokenize Chinese characters.
                This should likely be deactivated for Japanese:
                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328

        Raises:
            ValueError: If ``vocab_file`` is not an existing file.
        """
        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
                                            pad_token=pad_token, cls_token=cls_token,
                                            mask_token=mask_token, **kwargs)
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        # Reverse lookup table: id -> token.
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                  never_split=never_split,
                                                  tokenize_chinese_chars=tokenize_chinese_chars)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

    @property
    def vocab_size(self):
        """Number of entries in the loaded vocabulary."""
        return len(self.vocab)

    def _tokenize(self, text):
        """Split ``text`` into wordpieces, running the basic tokenizer first when enabled."""
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                for sub_token in self.wordpiece_tokenizer.tokenize(token):
                    split_tokens.append(sub_token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        # Unknown tokens fall back to the id of the unk token.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        # Joining with spaces and removing " ##" glues continuation pieces
        # back onto the piece before them.
        out_string = ' '.join(tokens).replace(' ##', '').strip()
        return out_string

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary to a directory or file.

        Args:
            vocab_path: An existing directory (the vocabulary is written to
                ``vocab.txt`` inside it) or the path of the file to write.

        Returns:
            A one-element tuple holding the path of the written file.
        """
        index = 0
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
        else:
            # Anything that is not a directory is taken as the target file itself.
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
                    index = token_index
                writer.write(token + u'\n')
                index += 1
        return (vocab_file,)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        """ Instantiate a BertTokenizer from pre-trained vocabulary files.

        For the checkpoint names known to this module, ``do_lower_case`` is
        forced to agree with the ``-cased`` marker in the name, and a warning
        is logged whenever the caller's value is overridden.
        """
        if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES:
            if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
                logger.warning("The pre-trained model you are loading is a cased model but you have not set "
                               "`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
                               "you may want to check this behavior.")
                kwargs['do_lower_case'] = False
            elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
                logger.warning("The pre-trained model you are loading is an uncased model but you have set "
                               "`do_lower_case` to False. We are setting `do_lower_case=True` for you "
                               "but you may want to check this behavior.")
                kwargs['do_lower_case'] = True

        return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
201
+
202
+
203
class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
        """ Constructs a BasicTokenizer.

        Args:
            **do_lower_case**: Whether to lower case the input.
            **never_split**: (`optional`) list of str
                Tokens that are neither lower-cased nor split on punctuation.
            **tokenize_chinese_chars**: (`optional`) boolean (default True)
                Whether to tokenize Chinese characters.
                This should likely be deactivated for Japanese:
                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
        """
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = never_split
        self.tokenize_chinese_chars = tokenize_chinese_chars

    def tokenize(self, text, never_split=None):
        """ Basic Tokenization of a piece of text.
            Cleans the text, splits it on whitespace and then on punctuation;
            for sub-word tokenization, see WordPieceTokenizer.

        Args:
            **text**: The string to tokenize.
            **never_split**: (`optional`) list of str
                Extra tokens to protect for this call, in addition to the ones
                given to the constructor.

        Returns:
            The list of resulting tokens.
        """
        # A set accepts any iterable (list, tuple, ...) and makes the
        # per-token membership test cheap.
        protected = set(self.never_split)
        if never_split is not None:
            protected.update(never_split)
        text = self._clean_text(text)
        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        split_tokens = []
        for token in whitespace_tokenize(text):
            if self.do_lower_case and token not in protected:
                token = token.lower()
                token = self._run_strip_accents(token)
            # Hand the protected set on, so tokens such as "[SEP]" are not
            # broken apart at their brackets.
            split_tokens.extend(self._run_split_on_punc(token, protected))

        return whitespace_tokenize(" ".join(split_tokens))

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # NFD separates base characters from their combining marks (category
        # "Mn"), which are then dropped.
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(char for char in decomposed if unicodedata.category(char) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text.

        Every punctuation character becomes a token of its own; the characters
        between them are grouped into words. ``text`` is returned as a single
        token when it is listed in ``never_split``.
        """
        if never_split is not None and text in never_split:
            return [text]
        output = []
        start_new_word = True
        for char in text:
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character and around any digit."""
        output = []
        for char in text:
            if self._is_chinese_char(ord(char)) or char.isdigit():
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
                (cp >= 0x3400 and cp <= 0x4DBF) or
                (cp >= 0x20000 and cp <= 0x2A6DF) or
                (cp >= 0x2A700 and cp <= 0x2B73F) or
                (cp >= 0x2B740 and cp <= 0x2B81F) or
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or
                (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            # Drop NUL, the Unicode replacement character and control characters.
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)
337
+
338
+
339
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Break text into word pieces with a greedy longest-match-first search.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens. A word that is longer than
          `max_input_chars_per_word`, or that cannot be fully covered by
          vocabulary entries, contributes the unknown token instead.
        """
        result = []
        # str.split() with no argument trims and splits on whitespace runs.
        for word in text.split():
            if len(word) > self.max_input_chars_per_word:
                result.append(self.unk_token)
                continue
            pieces = self._greedy_pieces(word)
            if pieces is None:
                result.append(self.unk_token)
            else:
                result.extend(pieces)
        return result

    def _greedy_pieces(self, word):
        """Return the word pieces of one word, or None if some part has no match."""
        pieces = []
        total = len(word)
        begin = 0
        while begin < total:
            # Pieces after the first one carry the "##" continuation marker.
            prefix = "##" if begin > 0 else ""
            for stop in range(total, begin, -1):
                candidate = prefix + word[begin:stop]
                if candidate in self.vocab:
                    break
            else:
                return None
            pieces.append(candidate)
            begin = stop
        return pieces
397
+
398
+
399
def _is_whitespace(char):
    """Tell whether `char` should be treated as whitespace."""
    # Tab, newline and carriage return are technically control characters,
    # but they are counted as whitespace here.
    if char in (" ", "\t", "\n", "\r"):
        return True
    # Otherwise only the Unicode "space separator" category qualifies.
    return unicodedata.category(char) == "Zs"
409
+
410
+
411
def _is_control(char):
    """Tell whether `char` is a control character."""
    # Tab, newline and carriage return belong to a control category, yet they
    # are handled as whitespace and therefore excluded here.
    if char in ("\t", "\n", "\r"):
        return False
    # Every Unicode category whose name starts with "C" counts as control.
    return unicodedata.category(char).startswith("C")
421
+
422
+
423
def _is_punctuation(char):
    """Tell whether `char` is a punctuation character."""
    code = ord(char)
    # All ASCII characters that are neither letters nor digits are treated as
    # punctuation. Characters such as "^", "$" and "`" are not in the Unicode
    # punctuation class, but are included for consistency.
    ascii_symbol = (33 <= code <= 47 or 58 <= code <= 64 or
                    91 <= code <= 96 or 123 <= code <= 126)
    if ascii_symbol:
        return True
    # Beyond ASCII, rely on the Unicode "P*" categories.
    return unicodedata.category(char).startswith("P")
tokenizations/tokenization_bert_word_level.py ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes."""
16
+
17
+ from __future__ import absolute_import, division, print_function, unicode_literals
18
+
19
+ import collections
20
+ import logging
21
+ import os
22
+ import unicodedata
23
+ import thulac
24
+ from io import open
25
+
26
+ from transformers.tokenization_utils import PreTrainedTokenizer
27
+
28
logger = logging.getLogger(__name__)

# Module-wide THULAC segmenter, created at import time in segmentation-only mode.
# The user dictionary ships next to this module (thulac_dict/seg), so its path is
# resolved relative to this file. That makes the import work from any working
# directory, not only from the repository root.
lac = thulac.thulac(
    user_dict=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'thulac_dict', 'seg'),
    seg_only=True)
31
+
32
# Name of the file that holds the vocabulary.
VOCAB_FILES_NAMES = dict(vocab_file='vocab.txt')

# Vocabulary download URL for every known checkpoint name.
PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file': {
        'bert-base-uncased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
        'bert-large-uncased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
        'bert-base-cased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
        'bert-large-cased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
        'bert-base-multilingual-uncased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
        'bert-base-multilingual-cased':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
        'bert-base-chinese':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
        'bert-base-german-cased':
            "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
        'bert-large-uncased-whole-word-masking':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
        'bert-large-cased-whole-word-masking':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
        'bert-large-uncased-whole-word-masking-finetuned-squad':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
        'bert-large-cased-whole-word-masking-finetuned-squad':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
        'bert-base-cased-finetuned-mrpc':
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
    },
}

# Every checkpoint listed above is registered with the same size, 512.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(
    PRETRAINED_VOCAB_FILES_MAP['vocab_file'], 512)
68
+
69
def load_vocab(vocab_file):
    """Build a token-to-index dictionary from a vocabulary file.

    Args:
        vocab_file: Path to a UTF-8 text file with one token per line.

    Returns:
        An ``OrderedDict`` whose keys are the lines (trailing newline removed)
        and whose values are their zero-based line indices; a repeated token
        keeps the index of its last occurrence.
    """
    mapping = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for line_no, raw_line in enumerate(reader.readlines()):
            mapping[raw_line.rstrip('\n')] = line_no
    return mapping
78
+
79
+
80
def whitespace_tokenize(text):
    """Strip ``text`` and cut it into its whitespace-separated pieces.

    An input that is empty once stripped produces an empty list.
    """
    cleaned = text.strip()
    if cleaned:
        return cleaned.split()
    return []
87
+
88
+
89
class BertTokenizer(PreTrainedTokenizer):
    r"""
    Constructs a BertTokenizer.
    :class:`~pytorch_pretrained_bert.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
            minimum of this value (if specified) and the underlying BERT model's sequence length.
            Not a named parameter of ``__init__``; it is forwarded to the base class through ``**kwargs``.
        never_split: List of tokens which will never be split during tokenization. Only has an effect when
            do_basic_tokenize=True
    """

    # Lookup tables defined at module level and exposed as class attributes.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
                 mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs):
        """Constructs a BertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
            **do_lower_case**: (`optional`) boolean (default True)
                Whether to lower case the input
                Only has an effect when do_basic_tokenize=True
            **do_basic_tokenize**: (`optional`) boolean (default True)
                Whether to do basic tokenization before wordpiece.
            **never_split**: (`optional`) list of string
                List of tokens which will never be split during tokenization.
                Only has an effect when do_basic_tokenize=True
            **tokenize_chinese_chars**: (`optional`) boolean (default True)
                Passed to :class:`BasicTokenizer`, where it enables the
                Chinese-specific pre-tokenization step.

        Raises:
            ValueError: if ``vocab_file`` is not an existing file.
        """
        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
                                            pad_token=pad_token, cls_token=cls_token,
                                            mask_token=mask_token, **kwargs)
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        # Reverse mapping (id -> token), in the same order as the vocabulary.
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            # Only created on demand; `_tokenize` checks the same flag before using it.
            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                  never_split=never_split,
                                                  tokenize_chinese_chars=tokenize_chinese_chars)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

    @property
    def vocab_size(self):
        """Number of entries in the loaded vocabulary."""
        return len(self.vocab)

    def _tokenize(self, text):
        """Split ``text`` into wordpieces, optionally running basic tokenization first."""
        if not self.do_basic_tokenize:
            return self.wordpiece_tokenizer.tokenize(text)
        split_tokens = []
        # Special tokens are handed over as `never_split` so the basic tokenizer leaves them alone.
        for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
            split_tokens.extend(self.wordpiece_tokenizer.tokenize(token))
        return split_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        # Unknown tokens fall back to the id of `unk_token`.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        # Joining with spaces and then deleting ' ##' glues continuation pieces back to their word.
        out_string = ' '.join(tokens).replace(' ##', '').strip()
        return out_string

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary to a directory or file.

        Args:
            vocab_path: Either an existing directory (the file name is then
                taken from ``VOCAB_FILES_NAMES['vocab_file']``) or the path of
                the vocabulary file to write.

        Returns:
            A 1-tuple holding the path of the written vocabulary file.
        """
        index = 0
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
        else:
            # Previously `vocab_file` was left unassigned here, so passing a
            # file path failed with UnboundLocalError.
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
                    # Resynchronise so the warning is emitted once per gap, not for every later token.
                    index = token_index
                writer.write(token + u'\n')
                index += 1
        return (vocab_file,)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        """ Instantiate a BertTokenizer from pre-trained vocabulary files.

        For names listed in ``PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES`` the
        ``do_lower_case`` keyword is forced to agree with the presence of
        ``'-cased'`` in the name, and a warning is logged when it is overridden.
        """
        if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES:
            if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
                logger.warning("The pre-trained model you are loading is a cased model but you have not set "
                               "`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
                               "you may want to check this behavior.")
                kwargs['do_lower_case'] = False
            elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
                logger.warning("The pre-trained model you are loading is an uncased model but you have set "
                               "`do_lower_case` to False. We are setting `do_lower_case=True` for you "
                               "but you may want to check this behavior.")
                kwargs['do_lower_case'] = True

        return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
204
+
205
+
206
class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
        """ Constructs a BasicTokenizer.

        Args:
            **do_lower_case**: Whether to lower case the input.
            **never_split**: (`optional`) list of str
                Tokens that must be kept intact (not lower-cased, not split on
                punctuation). Defaults to an empty list.
            **tokenize_chinese_chars**: (`optional`) boolean (default True)
                Whether to run `_tokenize_chinese_chars` before whitespace
                splitting.
        """
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = never_split
        self.tokenize_chinese_chars = tokenize_chinese_chars

    def tokenize(self, text, never_split=None):
        """ Basic Tokenization of a piece of text.

        Cleans the text, optionally applies the Chinese pre-tokenization step,
        splits on whitespace, lower-cases and strips accents (when
        ``do_lower_case`` is set) and finally splits on punctuation. For
        sub-word tokenization, see WordpieceTokenizer.

        Args:
            **never_split**: (`optional`) iterable of str
                Extra tokens to protect for this call, in addition to the ones
                given to the constructor. Protected tokens are neither
                lower-cased nor split on punctuation.

        Returns:
            A list of token strings.
        """
        # A set accepts any iterable from the caller (list, tuple, set); the
        # former list concatenation raised TypeError for anything but a list.
        protected = set(self.never_split)
        if never_split is not None:
            protected.update(never_split)

        text = self._clean_text(text)
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)

        split_tokens = []
        for token in whitespace_tokenize(text):
            if self.do_lower_case and token not in protected:
                token = token.lower()
                token = self._run_strip_accents(token)
            # Forward the protected set: without it, tokens such as "[SEP]"
            # were still broken apart on their brackets.
            split_tokens.extend(self._run_split_on_punc(token, protected))

        # Re-split to drop any empty or whitespace-only fragments.
        return whitespace_tokenize(" ".join(split_tokens))

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # NFD separates base characters from combining marks (category "Mn"),
        # which are then dropped.
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(char for char in decomposed if unicodedata.category(char) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text.

        Every punctuation character becomes its own token; runs of other
        characters stay together. ``text`` is returned unsplit when it is
        contained in ``never_split``.
        """
        if never_split is not None and text in never_split:
            return [text]
        pieces = []
        start_new_word = True
        for char in text:
            if _is_punctuation(char):
                pieces.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    pieces.append([])
                start_new_word = False
                pieces[-1].append(char)
        return ["".join(piece) for piece in pieces]

    def _tokenize_chinese_chars(self, text):
        """Isolates digits and segments the text into space-separated words.

        Each digit is surrounded with spaces, then the text is passed to the
        module-level segmenter ``lac`` and the resulting words are joined with
        single spaces. This replaces the per-character CJK spacing of the
        original BERT tokenizer.
        """
        spaced = "".join(" " + char + " " if char.isdigit() else char for char in text)
        # NOTE(review): the first element of each item returned by `lac.cut` is
        # taken as the word; presumably items are (word, tag) pairs. Confirm
        # against how `lac` is constructed.
        words = (item[0].strip() for item in lac.cut(spaced))
        return " ".join(word for word in words if word)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like all of the other languages.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            # Drop NUL, the Unicode replacement character and control characters.
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            # Every kind of whitespace is normalised to a plain space.
            output.append(" " if _is_whitespace(char) else char)
        return "".join(output)
354
+
355
+
356
class WordpieceTokenizer(object):
    """Greedy WordPiece tokenizer backed by a fixed vocabulary."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        # `vocab` only needs to support membership tests (`piece in vocab`).
        self.max_input_chars_per_word = max_input_chars_per_word
        self.unk_token = unk_token
        self.vocab = vocab

    def tokenize(self, text):
        """Break whitespace-separated words into vocabulary word pieces.

        Each word is consumed left to right, always taking the longest prefix
        of the remaining characters that is present in the vocabulary. Pieces
        that do not start the word are looked up with a leading "##".

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        A word longer than `max_input_chars_per_word`, or one for which some
        position has no matching piece, is replaced by `unk_token` as a whole.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """
        pieces = []
        for word in whitespace_tokenize(text):
            letters = list(word)
            length = len(letters)
            if length > self.max_input_chars_per_word:
                pieces.append(self.unk_token)
                continue

            matched = []
            begin = 0
            while begin < length:
                # Try the longest candidate first and shrink from the right.
                for stop in range(length, begin, -1):
                    candidate = "".join(letters[begin:stop])
                    if begin > 0:
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        break
                else:
                    # No candidate starting at `begin` is in the vocabulary.
                    matched = None
                    break
                matched.append(candidate)
                begin = stop

            if matched is None:
                pieces.append(self.unk_token)
            else:
                pieces.extend(matched)
        return pieces
414
+
415
+
416
+ def _is_whitespace(char):
417
+ """Checks whether `chars` is a whitespace character."""
418
+ # \t, \n, and \r are technically contorl characters but we treat them
419
+ # as whitespace since they are generally considered as such.
420
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
421
+ return True
422
+ cat = unicodedata.category(char)
423
+ if cat == "Zs":
424
+ return True
425
+ return False
426
+
427
+
428
+ def _is_control(char):
429
+ """Checks whether `chars` is a control character."""
430
+ # These are technically control characters but we count them as whitespace
431
+ # characters.
432
+ if char == "\t" or char == "\n" or char == "\r":
433
+ return False
434
+ cat = unicodedata.category(char)
435
+ if cat.startswith("C"):
436
+ return True
437
+ return False
438
+
439
+
440
+ def _is_punctuation(char):
441
+ """Checks whether `chars` is a punctuation character."""
442
+ cp = ord(char)
443
+ # We treat all non-letter/number ASCII as punctuation.
444
+ # Characters such as "^", "$", and "`" are not in the Unicode
445
+ # Punctuation class but we treat them as punctuation anyways, for
446
+ # consistency.
447
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
448
+ (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
449
+ return True
450
+ cat = unicodedata.category(char)
451
+ if cat.startswith("P"):
452
+ return True
453
+ return False