Initial release

Browse files

Files changed (7) hide show

README.md +73 -0
config.json +45 -0
pytorch_model.bin +3 -0
sentencepiece.bpe.model +3 -0
special_tokens_map.json +15 -0
tokenization_bart_japanese.py +314 -0
tokenizer_config.json +22 -0

README.md ADDED Viewed

	@@ -0,0 +1,73 @@

+---
+language:
+- ja
+license: mit
+tags:
+- bart
+- pytorch
+datasets:
+- wikipedia
+---
+# bart-large-japanese
+This model is converted from the original [Japanese BART Pretrained model](https://nlp.ist.i.kyoto-u.ac.jp/?BART%E6%97%A5%E6%9C%AC%E8%AA%9EPretrained%E3%83%A2%E3%83%87%E3%83%AB) released by Kyoto University.
+Both the encoder and decoder outputs are identical to the original Fairseq model.
+### How to use the model
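+The input text should be tokenized with [BartJapaneseTokenizer](https://huggingface.co/Formzu/bart-large-japanese/blob/main/tokenization_bart_japanese.py). Download `tokenization_bart_japanese.py` into your working directory first, because the examples below import the tokenizer class from that file.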
+Tokenizer requirements:
+* [Juman++](https://github.com/ku-nlp/jumanpp)
+* [zenhan](https://pypi.org/project/zenhan/)
+* [pyknp](https://pypi.org/project/pyknp/)
+* [sentencepiece](https://pypi.org/project/sentencepiece/)
+#### Simple FillMaskPipeline
+```python
+from transformers import AutoModelForSeq2SeqLM, pipeline
+from tokenization_bart_japanese import BartJapaneseTokenizer
+model_name = "Formzu/bart-large-japanese"
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+tokenizer = BartJapaneseTokenizer.from_pretrained(model_name)
+masked_text = "天気が<mask>から散歩しましょう。"
+fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
+out = fill_mask(masked_text)
+print(out)
+# [{'score': 0.03228279948234558, 'token': 2566, 'token_str': 'いい', 'sequence': '天気 が いい から 散歩 し ましょう 。'},
+#  {'score': 0.023878807201981544, 'token': 27365, 'token_str': '晴れ', 'sequence': '天気 が 晴れ から 散歩 し ましょう 。'},
+#  {'score': 0.020059829577803612, 'token': 267, 'token_str': '南', 'sequence': '天気 が 南 から 散歩 し ましょう 。'},
+#  {'score': 0.013921134173870087, 'token': 17, 'token_str': 'な', 'sequence': '天気 が な から 散歩 し ましょう 。'},
+#  {'score': 0.013069136068224907, 'token': 1718, 'token_str': 'よく', 'sequence': '天気 が よく から 散歩 し ましょう 。'}]
+```
+#### Text Generation
+```python
+from transformers import AutoModelForSeq2SeqLM
+from tokenization_bart_japanese import BartJapaneseTokenizer
+import torch
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+model_name = "Formzu/bart-large-japanese"
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
+tokenizer = BartJapaneseTokenizer.from_pretrained(model_name)
+masked_text = "天気が<mask>から散歩しましょう。"
+inp = tokenizer(masked_text, return_tensors='pt').to(device)
+out = model.generate(**inp, num_beams=1, min_length=0, max_length=20, early_stopping=True,  no_repeat_ngram_size=2)
+res = "".join(tokenizer.decode(out.squeeze(0).tolist(), skip_special_tokens=True).split(" "))
+print(res)
+# 天気がいいから散歩しましょう。天気のいいへやから、ここから
+```
+### Framework versions
+- Transformers 4.21.2
+- Pytorch 1.12.1+cu116
+- Tokenizers 0.12.1

config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "_name_or_path": "bart-large-japanese",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "add_final_layer_norm": true,
+  "architectures": [
+    "MBartForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.0,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 12,
+  "dropout": 0.0,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 12,
+  "eos_token_id": 2,
+  "forced_eos_token_id": 2,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "max_position_embeddings": 1024,
+  "model_type": "mbart",
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "scale_embedding": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.21.2",
+  "use_cache": true,
+  "vocab_size": 32002
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:844c40c88119301c1baa0dc28e6084914dd83de2dcd3bc04a297181fafa19c0c
+size 1550669945

sentencepiece.bpe.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff9226612d029bfade0621f401cb605740dc0a8ca88400e89ffdce26702ee266
+size 588767

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

tokenization_bart_japanese.py ADDED Viewed

	@@ -0,0 +1,314 @@

+# coding=utf-8
+# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from contextlib import contextmanager
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+import sentencepiece as spm
+from transformers import AddedToken, PreTrainedTokenizer
+from transformers import logging
+logger = logging.get_logger(__name__)
+SPIECE_UNDERLINE = "▁"
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "Formzu/bart-base-japanese": (
+            "https://huggingface.co/Formzu/bart-base-japanese/resolve/main/sentencepiece.bpe.model"
+        ),
+        "Formzu/bart-large-japanese": (
+            "https://huggingface.co/Formzu/bart-large-japanese/resolve/main/sentencepiece.bpe.model"
+        ),
+    }
+}
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "Formzu/bart-base-japanese": 1024,
+    "Formzu/bart-large-japanese": 1024,
+}
+class BartJapaneseTokenizer(PreTrainedTokenizer):
+    """
+    Construct a BART tokenizer for Japanese text.
+    Adapted from [`RobertaTokenizer`], [`XLNetTokenizer`] and [`MBartTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
+    The tokenization method is `<bos> <tokens> <eos>`.
+    Examples:
+    ```python
+    >>> from tokenization_bart_japanese import BartJapaneseTokenizer
+    >>> tokenizer = BartJapaneseTokenizer.from_pretrained("Formzu/bart-base-japanese")
+    >>> example_japanese_phrase = "今日は晴れています。"
+    >>> expected_label = "天気"
+    >>> inputs = tokenizer(example_japanese_phrase, return_tensors="pt")
+    >>> labels = tokenizer(expected_label, return_tensors="pt")
+    >>> inputs["labels"] = labels["input_ids"]
+    ```"""
+    vocab_files_names = VOCAB_FILES_NAMES
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    model_input_names = ["input_ids", "attention_mask"]
+    prefix_tokens: List[int] = []
+    suffix_tokens: List[int] = []
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        tokenizer_file=None,
+        src_lang=None,
+        tgt_lang=None,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        additional_special_tokens=None,
+        **kwargs
+    ):
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            tokenizer_file=None,
+            src_lang=src_lang,
+            tgt_lang=tgt_lang,
+            additional_special_tokens=additional_special_tokens,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+        try:
+            from zenhan import h2z
+        except ModuleNotFoundError as error:
+            raise error.__class__(
+                "You need to install zenhan to use BartJapaneseTokenizer."
+                "See https://pypi.org/project/zenhan/ for installation."
+            )
+        try:
+            from pyknp import Juman
+        except ModuleNotFoundError as error:
+            raise error.__class__(
+                "You need to install pyknp to use BartJapaneseTokenizer."
+                "See https://pypi.org/project/pyknp/ for installation."
+            )
+        self.h2z = h2z
+        self.jumanpp = Juman()
+        # Original fairseq vocab and spm vocab must be "aligned":
+        # Vocab    |    0    |    1    |   2    |    3    |    4   |   5    |   6    |   7    |   8    |   9
+        # -------- | ------- | ------- | ------ | ------- | ------ | ------ | ------ | ------ | ------ | ------
+        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' |　'▁の'　|　'▁、'　|　'▁。'　|　'▁に'　|　'▁は'　|　'▁を'
+        # spm      | '<unk>' | '<s>'   | '</s>' |　'▁の'　 |　'▁、'　|　'▁。'　|　'▁に'　|　'▁は'　|　'▁を'　|　'▁と'
+        # Mimic fairseq token-to-id alignment for the first 4 token
+        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
+        # The first "real" token "▁の" has position 4 in the original fairseq vocab and position 3 in the spm vocab
+        self.fairseq_offset = 1
+        self.sp_model_size = len(self.sp_model)
+        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+        self.set_special_tokens()
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+        return state
+    def __setstate__(self, d):
+        self.__dict__ = d
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+    @property
+    def vocab_size(self):
+        return len(self.sp_model) + self.fairseq_offset + 1  # Plus 1 for the mask token
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        prefix_ones = [1] * len(self.prefix_tokens)
+        suffix_ones = [1] * len(self.suffix_tokens)
+        if token_ids_1 is None:
+            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
+        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A Japanese BART sequence has the following format, where `X` represents the sequence:
+        - `input_ids` (for encoder) `[bos] X [eos]`
+        - `decoder_input_ids`: (for decoder) `[bos] X [eos]`
+        Pairs of sequences are not the expected use case, but they will be handled without a separator.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Japanese BART does not
+        make use of token type ids, therefore a list of zeros is returned.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def _tokenize(self, text: str) -> List[str]:
+        text = text
+        text = self.h2z(text)
+        text = self.jumanpp.analysis(text)
+        text = ' '.join([mrph.midasi for mrph in text.mrph_list()])
+        return self.sp_model.encode(text, out_type=str)
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        if token in self.fairseq_tokens_to_ids:
+            return self.fairseq_tokens_to_ids[token]
+        spm_id = self.sp_model.PieceToId(token)
+        # Need to return unknown token if the SP model returned 0
+        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if index in self.fairseq_ids_to_tokens:
+            return self.fairseq_ids_to_tokens[index]
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+    def set_special_tokens(self) -> None:
+        """Set prefix=[bos], suffix=[eos]."""
+        self.prefix_tokens = [self.bos_token_id]
+        self.suffix_tokens = [self.eos_token_id]
+        self.add_tokens(self.all_special_tokens_extended, special_tokens=True)

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "additional_special_tokens": null,
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "src_lang": null,
+  "tgt_lang": null,
+  "tokenizer_class": "BartJapaneseTokenizer",
+  "tokenizer_file": null,
+  "unk_token": "<unk>"
+}