singletongue committed
Commit 4117b7f · Parent(s): f01bdac

Add model files

Files changed:
- .gitattributes +1 -0
- added_tokens.json +4 -0
- config.json +32 -0
- entity_vocab.json +3 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +23 -0
- tokenization_luke_bert_japanese.py +420 -0
- tokenizer_config.json +37 -0
- vocab.txt +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+entity_vocab.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "<ent2>": 32769,
+  "<ent>": 32768
+}
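The two entity markers are appended directly after the base WordPiece vocabulary, which is why config.json below declares "vocab_size": 32770. A quick check of the mapping, not part of this commit (it assumes you run it inside a local clone of the repository):

# Quick sanity check, not part of this commit: the entity markers occupy the two
# ids directly above the base WordPiece vocabulary (ids 0-32767), matching the
# "vocab_size": 32770 declared in config.json.
import json

with open("added_tokens.json", encoding="utf-8") as f:  # assumes a local clone
    added_tokens = json.load(f)

assert added_tokens["<ent>"] == 32768
assert added_tokens["<ent2>"] == 32769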
config.json
ADDED
@@ -0,0 +1,32 @@
+{
+  "_name_or_path": "cl-tohoku/bert-base-japanese-v3",
+  "architectures": [
+    "LukeForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bert_model_name": "cl-tohoku/bert-base-japanese-v3",
+  "bos_token_id": null,
+  "classifier_dropout": null,
+  "cls_entity_prediction": false,
+  "entity_emb_size": 256,
+  "entity_vocab_size": 591699,
+  "eos_token_id": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "luke",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.30.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "use_entity_aware_attention": true,
+  "vocab_size": 32770
+}
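For orientation, a minimal loading sketch, not part of this commit: the configuration above describes a base-sized LUKE model with entity-aware attention and a 591,699-entry entity vocabulary, and the checkpoint can be instantiated from a local clone of the repository (the directory path below is a placeholder assumption):

# Sketch, not part of this commit: load the config and weights from a local clone.
# "path/to/local/clone" is a placeholder; substitute the actual repository path.
from transformers import AutoConfig, LukeForMaskedLM

config = AutoConfig.from_pretrained("path/to/local/clone")
print(config.model_type)                  # "luke"
print(config.entity_vocab_size)           # 591699
print(config.use_entity_aware_attention)  # True

# Loads pytorch_model.bin added in this commit (a Git LFS file, ~1.1 GB).
model = LukeForMaskedLM.from_pretrained("path/to/local/clone")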
entity_vocab.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be6327e7cafc2f2b5f694a594d57113fd2bf6b620c592929202f75683b18b67d
+size 23721849
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12fc608cd4f1662905c6e025fea20ca90f8494fa93a5c1f7c825ed41220ef2e7
+size 1143901513
special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
+{
+  "additional_special_tokens": [
+    {
+      "content": "<ent>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<ent2>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
tokenization_luke_bert_japanese.py
ADDED
@@ -0,0 +1,420 @@
+# coding=utf-8
+# Copyright Studio-Ouisa and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for LUKE."""
+
+import collections
+import copy
+import json
+import os
+from typing import List, Optional, Tuple
+
+from transformers.models.bert_japanese.tokenization_bert_japanese import (
+    BasicTokenizer,
+    CharacterTokenizer,
+    JumanppTokenizer,
+    MecabTokenizer,
+    SentencepieceTokenizer,
+    SudachiTokenizer,
+    WordpieceTokenizer,
+    load_vocab,
+)
+from transformers.models.luke import LukeTokenizer
+from transformers.tokenization_utils_base import AddedToken
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+EntitySpan = Tuple[int, int]
+EntitySpanInput = List[EntitySpan]
+Entity = str
+EntityInput = List[Entity]
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "entity_vocab_file": "entity_vocab.json"}
+
+PRETRAINED_VOCAB_FILES_MAP = {"vocab_file": {}, "entity_vocab_file": {}}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+
+
+class LukeBertJapaneseTokenizer(LukeTokenizer):
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        entity_vocab_file,
+        spm_file=None,
+        task=None,
+        max_entity_length=32,
+        max_mention_length=30,
+        entity_token_1="<ent>",
+        entity_token_2="<ent2>",
+        entity_unk_token="[UNK]",
+        entity_pad_token="[PAD]",
+        entity_mask_token="[MASK]",
+        entity_mask2_token="[MASK2]",
+        do_lower_case=False,
+        do_word_tokenize=True,
+        do_subword_tokenize=True,
+        word_tokenizer_type="basic",
+        subword_tokenizer_type="wordpiece",
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        mecab_kwargs=None,
+        sudachi_kwargs=None,
+        jumanpp_kwargs=None,
+        **kwargs,
+    ):
+        # We call the grandparent's init, not the parent's.
+        super(LukeTokenizer, self).__init__(
+            spm_file=spm_file,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            do_lower_case=do_lower_case,
+            do_word_tokenize=do_word_tokenize,
+            do_subword_tokenize=do_subword_tokenize,
+            word_tokenizer_type=word_tokenizer_type,
+            subword_tokenizer_type=subword_tokenizer_type,
+            never_split=never_split,
+            mecab_kwargs=mecab_kwargs,
+            sudachi_kwargs=sudachi_kwargs,
+            jumanpp_kwargs=jumanpp_kwargs,
+            task=task,
+            max_entity_length=32,
+            max_mention_length=30,
+            entity_token_1="<ent>",
+            entity_token_2="<ent2>",
+            entity_unk_token=entity_unk_token,
+            entity_pad_token=entity_pad_token,
+            entity_mask_token=entity_mask_token,
+            entity_mask2_token=entity_mask2_token,
+            **kwargs,
+        )
+
+        if subword_tokenizer_type == "sentencepiece":
+            if not os.path.isfile(spm_file):
+                raise ValueError(
+                    f"Can't find a vocabulary file at path '{spm_file}'. To load the vocabulary from a Google"
+                    " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+                )
+            self.spm_file = spm_file
+        else:
+            if not os.path.isfile(vocab_file):
+                raise ValueError(
+                    f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google"
+                    " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+                )
+            self.vocab = load_vocab(vocab_file)
+            self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+
+        self.do_word_tokenize = do_word_tokenize
+        self.word_tokenizer_type = word_tokenizer_type
+        self.lower_case = do_lower_case
+        self.never_split = never_split
+        self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
+        self.sudachi_kwargs = copy.deepcopy(sudachi_kwargs)
+        self.jumanpp_kwargs = copy.deepcopy(jumanpp_kwargs)
+        if do_word_tokenize:
+            if word_tokenizer_type == "basic":
+                self.word_tokenizer = BasicTokenizer(
+                    do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False
+                )
+            elif word_tokenizer_type == "mecab":
+                self.word_tokenizer = MecabTokenizer(
+                    do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {})
+                )
+            elif word_tokenizer_type == "sudachi":
+                self.word_tokenizer = SudachiTokenizer(
+                    do_lower_case=do_lower_case, never_split=never_split, **(sudachi_kwargs or {})
+                )
+            elif word_tokenizer_type == "jumanpp":
+                self.word_tokenizer = JumanppTokenizer(
+                    do_lower_case=do_lower_case, never_split=never_split, **(jumanpp_kwargs or {})
+                )
+            else:
+                raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.")
+
+        self.do_subword_tokenize = do_subword_tokenize
+        self.subword_tokenizer_type = subword_tokenizer_type
+        if do_subword_tokenize:
+            if subword_tokenizer_type == "wordpiece":
+                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+            elif subword_tokenizer_type == "character":
+                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+            elif subword_tokenizer_type == "sentencepiece":
+                self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token)
+            else:
+                raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")
+
+        # we add 2 special tokens for downstream tasks
+        # for more information about lstrip and rstrip, see https://github.com/huggingface/transformers/pull/2778
+        entity_token_1 = (
+            AddedToken(entity_token_1, lstrip=False, rstrip=False)
+            if isinstance(entity_token_1, str)
+            else entity_token_1
+        )
+        entity_token_2 = (
+            AddedToken(entity_token_2, lstrip=False, rstrip=False)
+            if isinstance(entity_token_2, str)
+            else entity_token_2
+        )
+        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
+        kwargs["additional_special_tokens"] += [entity_token_1, entity_token_2]
+
+        with open(entity_vocab_file, encoding="utf-8") as entity_vocab_handle:
+            self.entity_vocab = json.load(entity_vocab_handle)
+        for entity_special_token in [entity_unk_token, entity_pad_token, entity_mask_token, entity_mask2_token]:
+            if entity_special_token not in self.entity_vocab:
+                raise ValueError(
+                    f"Specified entity special token ``{entity_special_token}`` is not found in entity_vocab. "
+                    f"Probably an incorrect entity vocab file is loaded: {entity_vocab_file}."
+                )
+        self.entity_unk_token_id = self.entity_vocab[entity_unk_token]
+        self.entity_pad_token_id = self.entity_vocab[entity_pad_token]
+        self.entity_mask_token_id = self.entity_vocab[entity_mask_token]
+        self.entity_mask2_token_id = self.entity_vocab[entity_mask2_token]
+
+        self.task = task
+        if task is None or task == "entity_span_classification":
+            self.max_entity_length = max_entity_length
+        elif task == "entity_classification":
+            self.max_entity_length = 1
+        elif task == "entity_pair_classification":
+            self.max_entity_length = 2
+        else:
+            raise ValueError(
+                f"Task {task} not supported. Select task from ['entity_classification', 'entity_pair_classification',"
+                " 'entity_span_classification'] only."
+            )
+
+        self.max_mention_length = max_mention_length
+
+    @property
+    # Copied from BertJapaneseTokenizer
+    def do_lower_case(self):
+        return self.lower_case
+
+    # Copied from BertJapaneseTokenizer
+    def __getstate__(self):
+        state = dict(self.__dict__)
+        if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]:
+            del state["word_tokenizer"]
+        return state
+
+    # Copied from BertJapaneseTokenizer
+    def __setstate__(self, state):
+        self.__dict__ = state
+        if self.word_tokenizer_type == "mecab":
+            self.word_tokenizer = MecabTokenizer(
+                do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {})
+            )
+        elif self.word_tokenizer_type == "sudachi":
+            self.word_tokenizer = SudachiTokenizer(
+                do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {})
+            )
+        elif self.word_tokenizer_type == "jumanpp":
+            self.word_tokenizer = JumanppTokenizer(
+                do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {})
+            )
+
+    # Copied from BertJapaneseTokenizer
+    def _tokenize(self, text):
+        if self.do_word_tokenize:
+            tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
+        else:
+            tokens = [text]
+
+        if self.do_subword_tokenize:
+            split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
+        else:
+            split_tokens = tokens
+
+        return split_tokens
+
+    @property
+    # Copied from BertJapaneseTokenizer
+    def vocab_size(self):
+        if self.subword_tokenizer_type == "sentencepiece":
+            return len(self.subword_tokenizer.sp_model)
+        return len(self.vocab)
+
+    # Copied from BertJapaneseTokenizer
+    def get_vocab(self):
+        if self.subword_tokenizer_type == "sentencepiece":
+            vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+            vocab.update(self.added_tokens_encoder)
+            return vocab
+        return dict(self.vocab, **self.added_tokens_encoder)
+
+    # Copied from BertJapaneseTokenizer
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        if self.subword_tokenizer_type == "sentencepiece":
+            return self.subword_tokenizer.sp_model.PieceToId(token)
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    # Copied from BertJapaneseTokenizer
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if self.subword_tokenizer_type == "sentencepiece":
+            return self.subword_tokenizer.sp_model.IdToPiece(index)
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    # Copied from BertJapaneseTokenizer
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        if self.subword_tokenizer_type == "sentencepiece":
+            return self.subword_tokenizer.sp_model.decode(tokens)
+        out_string = " ".join(tokens).replace(" ##", "").strip()
+        return out_string
+
+    # Copied from BertJapaneseTokenizer
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    # Copied from BertJapaneseTokenizer
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    # Copied from BertJapaneseTokenizer
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
+        pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        return (text, kwargs)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if os.path.isdir(save_directory):
+            if self.subword_tokenizer_type == "sentencepiece":
+                vocab_file = os.path.join(
+                    save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["spm_file"]
+                )
+            else:
+                vocab_file = os.path.join(
+                    save_directory,
+                    (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
+                )
+        else:
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+
+        if self.subword_tokenizer_type == "sentencepiece":
+            with open(vocab_file, "wb") as writer:
+                content_spiece_model = self.subword_tokenizer.sp_model.serialized_model_proto()
+                writer.write(content_spiece_model)
+        else:
+            with open(vocab_file, "w", encoding="utf-8") as writer:
+                index = 0
+                for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                    if index != token_index:
+                        logger.warning(
+                            f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                            " Please check that the vocabulary is not corrupted!"
+                        )
+                        index = token_index
+                    writer.write(token + "\n")
+                    index += 1
+
+        entity_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["entity_vocab_file"]
+        )
+
+        with open(entity_vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.entity_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        return vocab_file, entity_vocab_file
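For reference, a minimal usage sketch of the tokenizer class added above, not part of this commit. It assumes you run it from a local clone of this repository and have the MeCab dependencies (fugashi, unidic-lite) required by the "mecab" word tokenizer; entity spans are passed as character offsets, following the LukeTokenizer API.

# Sketch, not part of this commit: instantiate the custom tokenizer directly from
# the vocab files added here and encode one sentence with one entity span.
# Run from a local clone; requires fugashi and unidic-lite for MecabTokenizer.
from tokenization_luke_bert_japanese import LukeBertJapaneseTokenizer

tokenizer = LukeBertJapaneseTokenizer(
    vocab_file="vocab.txt",
    entity_vocab_file="entity_vocab.json",
    word_tokenizer_type="mecab",
    subword_tokenizer_type="wordpiece",
    mecab_kwargs={"mecab_dic": "unidic_lite"},
)

text = "伊藤博文は初代内閣総理大臣です。"
# Character offsets (0, 4) cover the mention "伊藤博文"; with no `entities`
# argument and task=None, the entity sequence is filled with the [MASK] entity.
encoding = tokenizer(text, entity_spans=[(0, 4)])
print(list(encoding.keys()))  # includes input_ids, entity_ids, entity_position_ids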
tokenizer_config.json
ADDED
@@ -0,0 +1,37 @@
+{
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_luke_bert_japanese.LukeBertJapaneseTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "do_subword_tokenize": true,
+  "do_word_tokenize": true,
+  "entity_mask2_token": "[MASK2]",
+  "entity_mask_token": "[MASK]",
+  "entity_pad_token": "[PAD]",
+  "entity_token_1": "<ent>",
+  "entity_token_2": "<ent2>",
+  "entity_unk_token": "[UNK]",
+  "jumanpp_kwargs": null,
+  "mask_token": "[MASK]",
+  "max_entity_length": 32,
+  "max_mention_length": 30,
+  "mecab_kwargs": {
+    "mecab_dic": "unidic_lite"
+  },
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "spm_file": null,
+  "subword_tokenizer_type": "wordpiece",
+  "sudachi_kwargs": null,
+  "task": null,
+  "tokenizer_class": "LukeBertJapaneseTokenizer",
+  "unk_token": "[UNK]",
+  "word_tokenizer_type": "mecab"
+}
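The auto_map entry above is what lets AutoTokenizer resolve LukeBertJapaneseTokenizer from tokenization_luke_bert_japanese.py when remote code is trusted. A minimal loading sketch, not part of this commit (the repository path or id is a placeholder):

# Sketch, not part of this commit: AutoTokenizer follows "auto_map" in
# tokenizer_config.json and imports the custom class from this repository's
# tokenization_luke_bert_japanese.py; trust_remote_code=True is required.
# "path/or/repo-id" is a placeholder.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/or/repo-id", trust_remote_code=True)
print(type(tokenizer).__name__)  # LukeBertJapaneseTokenizer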
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff