---
license: cc-by-nc-sa-4.0
datasets:
- wikipedia
- cc100
language:
- ja
library_name: transformers
pipeline_tag: fill-mask
---
|
|
|
# Japanese BERT-base (Vaporetto + WordPiece)
|
|
|
## How to load the tokenizer

Please download the dictionary file for Vaporetto + WordPiece from [our GitHub repository](https://github.com/hitachi-nlp/compare-ja-tokenizer/blob/public/data/dict/vaporetto_wordpiece.json).
Then load the tokenizer by setting `dict_path` to the path of the downloaded dictionary file.
The pre-tokenizer additionally needs a Vaporetto model file (the example below uses `bccwj-suw+unidic+tag.model.zst`); pass its path to `VaporettoPreTokenizer`.
|
|
|
```python |
|
from typing import Optional |
|
|
|
from tokenizers import Tokenizer, NormalizedString, PreTokenizedString |
|
from tokenizers.processors import BertProcessing |
|
from tokenizers.pre_tokenizers import PreTokenizer |
|
from transformers import PreTrainedTokenizerFast |
|
|
|
import vaporetto |
|
import textspan |
|
|
|
class VaporettoPreTokenizer:
    """Custom pre-tokenizer that splits text into words using Vaporetto.

    The instance exposes ``pre_tokenize``, which hands ``custom_split`` to
    ``PreTokenizedString.split`` so every piece of text is cut at the word
    boundaries reported by the Vaporetto tokenizer.
    """

    def __init__(self, unidic_path: str):
        """Build the Vaporetto tokenizer from a model file.

        Args:
            unidic_path: Path to the model file. The file is read as raw
                bytes and passed to ``vaporetto.Vaporetto`` with tag
                prediction disabled.
        """
        with open(unidic_path, 'rb') as model_file:
            model_bytes = model_file.read()
        self.tokenizer = vaporetto.Vaporetto(model_bytes, predict_tags=False)

    def tokenize(self, sequence: str) -> list[str]:
        """Return the surface string of each token found in ``sequence``."""
        return [tok.surface() for tok in self.tokenizer.tokenize(sequence)]

    def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
        """Split ``normalized_string`` into one slice per aligned token span.

        Args:
            i: Index supplied by the caller of the split callback; unused here.
            normalized_string: The text piece to split.

        Returns:
            Slices of ``normalized_string``, one for every ``(start, end)``
            pair that ``textspan.get_original_spans`` reports for the tokens.
        """
        source = str(normalized_string)
        words = self.tokenize(source)
        pieces = []
        # Each entry holds the (start, end) pairs belonging to one token.
        for char_spans in textspan.get_original_spans(words, source):
            for start, end in char_spans:
                pieces.append(normalized_string[start:end])
        return pieces

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Split ``pretok`` in place using ``custom_split``."""
        pretok.split(self.custom_split)
|
|
|
# load a pre-tokenizer
# (replace the path with the location of your Vaporetto model file)
pre_tokenizer = VaporettoPreTokenizer("/path/to/bccwj-suw+unidic+tag.model.zst")

# load a tokenizer
# `dict_path` is the path to the downloaded dictionary file, given as a string.
dict_path = "/path/to/vaporetto_wordpiece.json"
tokenizer = Tokenizer.from_file(dict_path)
# Wrap every encoded sequence with the [CLS] / [SEP] special tokens, using the
# ids that the loaded vocabulary assigns to them.
tokenizer.post_processor = BertProcessing(
    cls=("[CLS]", tokenizer.token_to_id('[CLS]')),
    sep=("[SEP]", tokenizer.token_to_id('[SEP]'))
)

# convert to PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token='[UNK]',
    cls_token='[CLS]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    mask_token='[MASK]'
)

# set a pre-tokenizer
# NOTE(review): the custom pre-tokenizer is attached only after the wrapper is
# built -- presumably because a Python-defined pre-tokenizer cannot be copied
# or serialized by the wrapper; confirm before reordering these steps.
tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(pre_tokenizer)
|
```

```python
|
# Test
# Sample sentence: "Hello. I am doing research on morphological analyzers."
test_str = "こんにちは。私は形態素解析器について研究をしています。"
tokenizer.convert_ids_to_tokens(tokenizer(test_str).input_ids)
# -> ['[CLS]','こ','##ん','##に','##ち','##は','。','私','は','形態','素','解析','器','に','つい','て','研究','を','し','て','い','ます','。','[SEP]']
|
```

## How to load the model

```python
|
from transformers import AutoModelForMaskedLM

# Load the pretrained masked-language-model checkpoint by its model id.
model = AutoModelForMaskedLM.from_pretrained(
    "hitachi-nlp/bert-base_vaporetto-wordpiece"
)
|
```

**See [our repository](https://github.com/hitachi-nlp/compare-ja-tokenizer) for more details!**
|
|