Initial commit

Browse files

Files changed (8) hide show

.gitattributes +34 -0
LICENSE +202 -0
README.md +64 -0
config.json +23 -0
distilbert_japanese_tokenizer.py +1012 -0
pytorch_model.bin +3 -0
spiece.model +3 -0
tokenizer_config.json +32 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,202 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2021 LINE Corporation
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,64 @@

+---
+language: ja
+license: apache-2.0
+mask_token: "[MASK]"
+widget:
+- text: "LINE株式会社で[MASK]の研究・開発をしている。"
+---
+# LINE DistilBERT Japanese
+This is a DistilBERT model pre-trained on 131 GB of Japanese web text.
+The teacher model is BERT-base that built in-house at LINE.
+The model was trained by [LINE Corporation](https://linecorp.com/).
+## For Japanese
+https://github.com/line/LINE-DistilBERT-Japanese/README_ja.md is written in Japanese.
+## How to use
+```python
+from transformers import AutoTokenizer, AutoModel
+tokenizer = AutoTokenizer.from_pretrained("line-corporation/line-distilbert-base-japanese", trust_remote_code=True)
+model = AutoModel.from_pretrained("line-corporation/line-distilbert-base-japanese")
+sentence = "LINE株式会社で[MASK]の研究・開発をしている。"
+print(model(**tokenizer(sentence, return_tensors="pt")))
+```
+## Model architecture
+The model architecture is the DitilBERT base model; 6 layers, 768 dimensions of hidden states, 12 attention heads, 66M parameters.
+## Evaluation
+The evaluation by [JGLUE](https://github.com/yahoojapan/JGLUE) is as follows:
+| model name             | #Params | Marc_ja | JNLI |       JSTS       |  JSQuAD   | JCommonSenseQA |
+|------------------------|:-------:|:-------:|:----:|:----------------:|:---------:|:--------------:|
+|                        |         |   acc   | acc  | Pearson/Spearman |   EM/F1   |      acc       |
+| LINE-DistilBERT        |   68M   |  95.6   | 88.9 |    89.2/85.1     | 87.3/93.3 |      76.1      |
+| Laboro-DistilBERT      |   68M   |  94.7   | 82.0 |    87.4/82.7     | 70.2/87.3 |      73.2      |
+| BandaiNamco-DistilBERT |   68M   |  94.6   | 81.6 |    86.8/82.1     | 80.0/88.0 |      66.5      |
+## Tokenization
+The texts are first tokenized by MeCab with the Unidic dictionary and then split into subwords by the SentencePiece algorithm. The vocabulary size is 32768.
+## Licenses
+The pretrained models are distributed under the terms of the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
+## To cite this work
+We haven't published any paper on this work. Please cite [this GitHub repository](http://github.com/line/LINE-DistilBERT-Japanese):
+```
+@article{LINE DistilBERT Japanese,
+  title = {LINE DistilBERT Japanese},
+  author = {"Koga, Kobayashi and Li, Shengzhe and Nakamachi, Akifumi and Sato, Toshinori"},
+  year = {2023},
+  howpublished = {\url{http://github.com/line/LINE-DistilBERT-Japanese}}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForMaskedLM"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "output_hidden_states": true,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": true,
+  "tie_weights_": true,
+  "transformers_version": "4.18.0",
+  "vocab_size": 32768
+}

distilbert_japanese_tokenizer.py ADDED Viewed

	@@ -0,0 +1,1012 @@

+# coding=utf-8
+# Copyright 2023 LINE Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Almost copied from [transformers.BertJapaneseTokenizer](https://github.com/huggingface/transformers/blob/v4.26.1/src/transformers/models/bert_japanese/tokenization_bert_japanese.py#)
+# This code is distributed under the Apache License 2.0.
+"""Tokenization classes."""
+import collections
+import copy
+import os
+import unicodedata
+from typing import Any, Dict, List, Optional, Tuple
+from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from transformers.utils import is_sentencepiece_available, logging
+if is_sentencepiece_available():
+    import sentencepiece as spm
+else:
+    spm = None
+logger = logging.get_logger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "spm_file": "spiece.model"}
+SPIECE_UNDERLINE = "▁"
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/vocab.txt",
+        "cl-tohoku/bert-base-japanese-whole-word-masking": (
+            "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/vocab.txt"
+        ),
+        "cl-tohoku/bert-base-japanese-char": (
+            "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/vocab.txt"
+        ),
+        "cl-tohoku/bert-base-japanese-char-whole-word-masking": (
+            "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/vocab.txt"
+        ),
+    }
+}
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "cl-tohoku/bert-base-japanese": 512,
+    "cl-tohoku/bert-base-japanese-whole-word-masking": 512,
+    "cl-tohoku/bert-base-japanese-char": 512,
+    "cl-tohoku/bert-base-japanese-char-whole-word-masking": 512,
+}
+PRETRAINED_INIT_CONFIGURATION = {
+    "cl-tohoku/bert-base-japanese": {
+        "do_lower_case": False,
+        "word_tokenizer_type": "mecab",
+        "subword_tokenizer_type": "wordpiece",
+    },
+    "cl-tohoku/bert-base-japanese-whole-word-masking": {
+        "do_lower_case": False,
+        "word_tokenizer_type": "mecab",
+        "subword_tokenizer_type": "wordpiece",
+    },
+    "cl-tohoku/bert-base-japanese-char": {
+        "do_lower_case": False,
+        "word_tokenizer_type": "mecab",
+        "subword_tokenizer_type": "character",
+    },
+    "cl-tohoku/bert-base-japanese-char-whole-word-masking": {
+        "do_lower_case": False,
+        "word_tokenizer_type": "mecab",
+        "subword_tokenizer_type": "character",
+    },
+}
+# Copied from transformers.models.bert.tokenization_bert.load_vocab
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        vocab[token] = index
+    return vocab
+# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+class DistilBertJapaneseTokenizer(PreTrainedTokenizer):
+    r"""
+    Construct a BERT tokenizer for Japanese text.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
+    to: this superclass for more information regarding those methods.
+    Args:
+        vocab_file (`str`):
+            Path to a one-wordpiece-per-line vocabulary file.
+        spm_file (`str`, *optional*):
+            Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm or .model
+            extension) that contains the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
+        do_word_tokenize (`bool`, *optional*, defaults to `True`):
+            Whether to do word tokenization.
+        do_subword_tokenize (`bool`, *optional*, defaults to `True`):
+            Whether to do subword tokenization.
+        word_tokenizer_type (`str`, *optional*, defaults to `"basic"`):
+            Type of word tokenizer. Choose from ["basic", "mecab", "sudachi", "jumanpp"].
+        subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`):
+            Type of subword tokenizer. Choose from ["wordpiece", "character", "sentencepiece",].
+        mecab_kwargs (`dict`, *optional*):
+            Dictionary passed to the `MecabTokenizer` constructor.
+        sudachi_kwargs (`dict`, *optional*):
+            Dictionary passed to the `SudachiTokenizer` constructor.
+        jumanpp_kwargs (`dict`, *optional*):
+            Dictionary passed to the `JumanppTokenizer` constructor.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names  = [ "input_ids" , "attention_mask" ]
+    def __init__(
+        self,
+        vocab_file,
+        spm_file=None,
+        do_lower_case=False,
+        do_word_tokenize=True,
+        do_subword_tokenize=True,
+        word_tokenizer_type="basic",
+        subword_tokenizer_type="wordpiece",
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        mecab_kwargs=None,
+        sudachi_kwargs=None,
+        jumanpp_kwargs=None,
+        **kwargs
+    ):
+        super().__init__(
+            spm_file=spm_file,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            do_lower_case=do_lower_case,
+            do_word_tokenize=do_word_tokenize,
+            do_subword_tokenize=do_subword_tokenize,
+            word_tokenizer_type=word_tokenizer_type,
+            subword_tokenizer_type=subword_tokenizer_type,
+            never_split=never_split,
+            mecab_kwargs=mecab_kwargs,
+            sudachi_kwargs=sudachi_kwargs,
+            jumanpp_kwargs=jumanpp_kwargs,
+            **kwargs,
+        )
+        if subword_tokenizer_type == "sentencepiece":
+            if not os.path.isfile(spm_file):
+                raise ValueError(
+                    f"Can't find a vocabulary file at path '{spm_file}'. To load the vocabulary from a Google"
+                    " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+                )
+            self.spm_file = spm_file
+        else:
+            if not os.path.isfile(vocab_file):
+                raise ValueError(
+                    f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google"
+                    " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+                )
+            self.vocab = load_vocab(vocab_file)
+            self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+        self.do_word_tokenize = do_word_tokenize
+        self.word_tokenizer_type = word_tokenizer_type
+        self.lower_case = do_lower_case
+        self.never_split = never_split
+        self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
+        self.sudachi_kwargs = copy.deepcopy(sudachi_kwargs)
+        self.jumanpp_kwargs = copy.deepcopy(jumanpp_kwargs)
+        if do_word_tokenize:
+            if word_tokenizer_type == "basic":
+                self.word_tokenizer = BasicTokenizer(
+                    do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False
+                )
+            elif word_tokenizer_type == "mecab":
+                self.word_tokenizer = MecabTokenizer(
+                    do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {})
+                )
+            elif word_tokenizer_type == "sudachi":
+                self.word_tokenizer = SudachiTokenizer(
+                    do_lower_case=do_lower_case, never_split=never_split, **(sudachi_kwargs or {})
+                )
+            elif word_tokenizer_type == "jumanpp":
+                self.word_tokenizer = JumanppTokenizer(
+                    do_lower_case=do_lower_case, never_split=never_split, **(jumanpp_kwargs or {})
+                )
+            else:
+                raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.")
+        self.do_subword_tokenize = do_subword_tokenize
+        self.subword_tokenizer_type = subword_tokenizer_type
+        if do_subword_tokenize:
+            if subword_tokenizer_type == "wordpiece":
+                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+            elif subword_tokenizer_type == "character":
+                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+            elif subword_tokenizer_type == "sentencepiece":
+                self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token)
+            else:
+                raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")
+    @property
+    def do_lower_case(self):
+        return self.lower_case
+    def __getstate__(self):
+        state = dict(self.__dict__)
+        if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]:
+            del state["word_tokenizer"]
+        return state
+    def __setstate__(self, state):
+        self.__dict__ = state
+        if self.word_tokenizer_type == "mecab":
+            self.word_tokenizer = MecabTokenizer(
+                do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {})
+            )
+        elif self.word_tokenizer_type == "sudachi":
+            self.word_tokenizer = SudachiTokenizer(
+                do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {})
+            )
+        elif self.word_tokenizer_type == "jumanpp":
+            self.word_tokenizer = JumanppTokenizer(
+                do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {})
+            )
+    def _tokenize(self, text):
+        if self.do_word_tokenize:
+            tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
+        else:
+            tokens = [text]
+        if self.do_subword_tokenize:
+            split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
+        else:
+            split_tokens = tokens
+        return split_tokens
+    @property
+    def vocab_size(self):
+        if self.subword_tokenizer_type == "sentencepiece":
+            return len(self.subword_tokenizer.sp_model)
+        return len(self.vocab)
+    def get_vocab(self):
+        if self.subword_tokenizer_type == "sentencepiece":
+            vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+            vocab.update(self.added_tokens_encoder)
+            return vocab
+        return dict(self.vocab, **self.added_tokens_encoder)
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        if self.subword_tokenizer_type == "sentencepiece":
+            return self.subword_tokenizer.sp_model.PieceToId(token)
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if self.subword_tokenizer_type == "sentencepiece":
+            return self.subword_tokenizer.sp_model.IdToPiece(index)
+        return self.ids_to_tokens.get(index, self.unk_token)
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        if self.subword_tokenizer_type == "sentencepiece":
+            return self.subword_tokenizer.sp_model.decode(tokens)
+        out_string = " ".join(tokens).replace(" ##", "").strip()
+        return out_string
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
+        pair mask has the following format:
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if os.path.isdir(save_directory):
+            if self.subword_tokenizer_type == "sentencepiece":
+                vocab_file = os.path.join(
+                    save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["spm_file"]
+                )
+            else:
+                vocab_file = os.path.join(
+                    save_directory,
+                    (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
+                )
+        else:
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        if self.subword_tokenizer_type == "sentencepiece":
+            with open(vocab_file, "wb") as writer:
+                content_spiece_model = self.subword_tokenizer.sp_model.serialized_model_proto()
+                writer.write(content_spiece_model)
+        else:
+            with open(vocab_file, "w", encoding="utf-8") as writer:
+                index = 0
+                for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                    if index != token_index:
+                        logger.warning(
+                            f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                            " Please check that the vocabulary is not corrupted!"
+                        )
+                        index = token_index
+                    writer.write(token + "\n")
+                    index += 1
+        return (vocab_file,)
+class MecabTokenizer:
+    """Runs basic tokenization with MeCab morphological parser."""
+    def __init__(
+        self,
+        do_lower_case=False,
+        never_split=None,
+        normalize_text=True,
+        mecab_dic: Optional[str] = "ipadic",
+        mecab_option: Optional[str] = None,
+    ):
+        """
+        Constructs a MecabTokenizer.
+        Args:
+            **do_lower_case**: (*optional*) boolean (default True)
+                Whether to lowercase the input.
+            **never_split**: (*optional*) list of str
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
+            **normalize_text**: (*optional*) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+            **mecab_dic**: (*optional*) string (default "ipadic")
+                Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
+                set this option to `None` and modify *mecab_option*.
+            **mecab_option**: (*optional*) string
+                String passed to MeCab constructor.
+        """
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split if never_split is not None else []
+        self.normalize_text = normalize_text
+        try:
+            import fugashi
+        except ModuleNotFoundError as error:
+            raise error.__class__(
+                "You need to install fugashi to use MecabTokenizer. "
+                "See https://pypi.org/project/fugashi/ for installation."
+            )
+        mecab_option = mecab_option or ""
+        if mecab_dic is not None:
+            if mecab_dic == "ipadic":
+                try:
+                    import ipadic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The ipadic dictionary is not installed. "
+                        "See https://github.com/polm/ipadic-py for installation."
+                    )
+                dic_dir = ipadic.DICDIR
+            elif mecab_dic == "unidic_lite":
+                try:
+                    import unidic_lite
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic_lite dictionary is not installed. "
+                        "See https://github.com/polm/unidic-lite for installation."
+                    )
+                dic_dir = unidic_lite.DICDIR
+            elif mecab_dic == "unidic":
+                try:
+                    import unidic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic dictionary is not installed. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+                dic_dir = unidic.DICDIR
+                if not os.path.isdir(dic_dir):
+                    raise RuntimeError(
+                        "The unidic dictionary itself is not found. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+            else:
+                raise ValueError("Invalid mecab_dic is specified.")
+            mecabrc = os.path.join(dic_dir, "mecabrc")
+            mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option
+        self.mecab = fugashi.GenericTagger(mecab_option)
+    def tokenize(self, text, never_split=None, **kwargs):
+        """Tokenizes a piece of text."""
+        if self.normalize_text:
+            text = unicodedata.normalize("NFKC", text)
+        never_split = self.never_split + (never_split if never_split is not None else [])
+        tokens = []
+        for word in self.mecab(text):
+            token = word.surface
+            if self.do_lower_case and token not in never_split:
+                token = token.lower()
+            tokens.append(token)
+        return tokens
+class SudachiTokenizer:
+    """Runs basic tokenization with Sudachi morphological parser."""
+    def __init__(
+        self,
+        do_lower_case=False,
+        never_split=None,
+        normalize_text=True,
+        trim_whitespace=False,
+        sudachi_split_mode="A",
+        sudachi_config_path=None,
+        sudachi_resource_dir=None,
+        sudachi_dict_type="core",
+    ):
+        """
+        Constructs a SudachiTokenizer.
+        Args:
+            **do_lower_case**: (*optional*) boolean (default True)
+                Whether to lowercase the input.
+            **never_split**: (*optional*) list of str
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
+            **normalize_text**: (*optional*) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+            **trim_whitespace**: (*optional*) boolean (default False)
+                Whether to trim all whitespace, tab, newline from tokens.
+            **sudachi_split_mode**: (*optional*) string
+                Split mode of sudachi, choose from "A", "B", "C".
+            **sudachi_config_path**: (*optional*) string
+            **sudachi_resource_dir**: (*optional*) string
+            **sudachi_dict_type**: (*optional*) string
+                dict type of sudachi, choose from "small", "core", "full".
+        """
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split if never_split is not None else []
+        self.normalize_text = normalize_text
+        self.trim_whitespace = trim_whitespace
+        try:
+            from sudachipy import dictionary, tokenizer
+        except ImportError:
+            raise ImportError(
+                "You need to install sudachipy to use SudachiTokenizer. "
+                "See https://github.com/WorksApplications/SudachiPy for installation."
+            )
+        if sudachi_split_mode == "A":
+            self.split_mode = tokenizer.Tokenizer.SplitMode.A
+        elif sudachi_split_mode == "B":
+            self.split_mode = tokenizer.Tokenizer.SplitMode.B
+        elif sudachi_split_mode == "C":
+            self.split_mode = tokenizer.Tokenizer.SplitMode.C
+        else:
+            raise ValueError("Invalid sudachi_split_mode is specified.")
+        self.sudachi = dictionary.Dictionary(
+            config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type
+        ).create(self.split_mode)
+    def tokenize(self, text, never_split=None, **kwargs):
+        """Tokenizes a piece of text."""
+        if self.normalize_text:
+            text = unicodedata.normalize("NFKC", text)
+        never_split = self.never_split + (never_split if never_split is not None else [])
+        tokens = []
+        for word in self.sudachi.tokenize(text):
+            token = word.surface()
+            if self.do_lower_case and token not in never_split:
+                token = token.lower()
+            if self.trim_whitespace:
+                if token.strip() == "":
+                    continue
+                else:
+                    token = token.strip()
+            tokens.append(token)
+        return tokens
+class JumanppTokenizer:
+    """Runs basic tokenization with jumanpp morphological parser."""
+    def __init__(
+        self,
+        do_lower_case=False,
+        never_split=None,
+        normalize_text=True,
+        trim_whitespace=False,
+    ):
+        """
+        Constructs a JumanppTokenizer.
+        Args:
+            **do_lower_case**: (*optional*) boolean (default True)
+                Whether to lowercase the input.
+            **never_split**: (*optional*) list of str
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
+            **normalize_text**: (*optional*) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+            **trim_whitespace**: (*optional*) boolean (default False)
+                Whether to trim all whitespace, tab, newline from tokens.
+        """
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split if never_split is not None else []
+        self.normalize_text = normalize_text
+        self.trim_whitespace = trim_whitespace
+        try:
+            import rhoknp
+        except ImportError:
+            raise ImportError(
+                "You need to install rhoknp to use JumanppTokenizer. "
+                "See https://github.com/ku-nlp/rhoknp for installation."
+            )
+        self.juman = rhoknp.Jumanpp()
+    def tokenize(self, text, never_split=None, **kwargs):
+        """Tokenizes a piece of text."""
+        if self.normalize_text:
+            text = unicodedata.normalize("NFKC", text)
+        text = text.strip()
+        never_split = self.never_split + (never_split if never_split is not None else [])
+        tokens = []
+        for mrph in self.juman.apply_to_sentence(text).morphemes:
+            token = mrph.text
+            if self.do_lower_case and token not in never_split:
+                token = token.lower()
+            if self.trim_whitespace:
+                if token.strip() == "":
+                    continue
+                else:
+                    token = token.strip()
+            tokens.append(token)
+        return tokens
+class CharacterTokenizer:
+    """Runs Character tokenization."""
+    def __init__(self, vocab, unk_token, normalize_text=True):
+        """
+        Constructs a CharacterTokenizer.
+        Args:
+            **vocab**:
+                Vocabulary object.
+            **unk_token**: str
+                A special symbol for out-of-vocabulary token.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+        """
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.normalize_text = normalize_text
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into characters.
+        For example, `input = "apple""` wil return as output `["a", "p", "p", "l", "e"]`.
+        Args:
+            text: A single token or whitespace separated tokens.
+                This should have already been passed through *BasicTokenizer*.
+        Returns:
+            A list of characters.
+        """
+        if self.normalize_text:
+            text = unicodedata.normalize("NFKC", text)
+        output_tokens = []
+        for char in text:
+            if char not in self.vocab:
+                output_tokens.append(self.unk_token)
+                continue
+            output_tokens.append(char)
+        return output_tokens
+# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
+class BasicTokenizer(object):
+    """
+    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+    """
+    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
+        if never_split is None:
+            never_split = []
+        self.do_lower_case = do_lower_case
+        self.never_split = set(never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
+        WordPieceTokenizer.
+        Args:
+            never_split (`List[str]`, *optional*)
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
+        """
+        # union() returns a new set by concatenating the two sets.
+        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if token not in never_split:
+                if self.do_lower_case:
+                    token = token.lower()
+                    if self.strip_accents is not False:
+                        token = self._run_strip_accents(token)
+                elif self.strip_accents:
+                    token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token, never_split))
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if never_split is not None and text in never_split:
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+        return ["".join(x) for x in output]
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)  #
+            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
+            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
+            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
+        ):  #
+            return True
+        return False
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+        Returns:
+            A list of wordpiece tokens.
+        """
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+class SentencepieceTokenizer(object):
+    """
+    Runs sentencepiece tokenization. Based on transformers.models.albert.tokenization_albert.AlbertTokenizer.
+    """
+    def __init__(
+        self,
+        vocab,
+        unk_token,
+        do_lower_case=False,
+        remove_space=True,
+        keep_accents=True,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.do_lower_case = do_lower_case
+        self.remove_space = remove_space
+        self.keep_accents = keep_accents
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab)
+    def preprocess_text(self, inputs):
+        if self.remove_space:
+            outputs = " ".join(inputs.strip().split())
+        else:
+            outputs = inputs
+        outputs = outputs.replace("``", '"').replace("''", '"')
+        if not self.keep_accents:
+            outputs = unicodedata.normalize("NFKD", outputs)
+            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
+        if self.do_lower_case:
+            outputs = outputs.lower()
+        return outputs
+    def tokenize(self, text):
+        """
+        Tokenizes text by sentencepiece. Based on [SentencePiece](https://github.com/google/sentencepiece).
+        Tokenization needs the given vocabulary.
+        Args:
+            text: A string needs to be tokenized.
+        Returns:
+            A list of sentencepiece tokens.
+        """
+        text = self.preprocess_text(text)
+        pieces = self.sp_model.encode(text, out_type=str)
+        new_pieces = []
+        for piece in pieces:
+            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
+                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
+                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
+                    if len(cur_pieces[0]) == 1:
+                        cur_pieces = cur_pieces[1:]
+                    else:
+                        cur_pieces[0] = cur_pieces[0][1:]
+                cur_pieces.append(piece[-1])
+                new_pieces.extend(cur_pieces)
+            else:
+                new_pieces.append(piece)
+        return new_pieces

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d8b678bd9ce14363d5e6e3dc53c7dd7c76f8ad11394ccffd200f15e9f6819eb
+size 274894944

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcfafc8c0662d9c8f39621a64c74260f2ad120310c8dd24886de2dddaf599b4e
+size 439391

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "do_lower_case":true,
+    "remove_space":true,
+    "keep_accents":true,
+    "bos_token": "[CLS]",
+    "eos_token": "[SEP]",
+    "unk_token": "<unk>",
+    "sep_token": "[SEP]",
+    "pad_token": "<pad>",
+    "cls_token": "[CLS]",
+    "mask_token":{
+       "content":"[MASK]",
+       "single_word":false,
+       "lstrip":true,
+       "rstrip":false,
+       "normalized":false,
+       "__type":"AddedToken"
+    },
+    "tokenize_chinese_chars":false,
+    "tokenizer_class": "BertJapaneseTokenizer",
+    "word_tokenizer_type": "mecab",
+    "subword_tokenizer_type": "sentencepiece",
+    "mecab_kwargs": {
+        "mecab_dic": "unidic_lite"
+    },
+    "auto_map": {
+        "AutoTokenizer": [
+            "distilbert_japanese_tokenizer.DistilBertJapaneseTokenizer",
+            null
+        ]
+     }
+}