File size: 2,926 Bytes
033909d c9cb3b8 033909d c9cb3b8 4129883 c9cb3b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
---
license: cc-by-nc-sa-4.0
datasets:
- wikipedia
- cc100
language:
- ja
library_name: transformers
pipeline_tag: fill-mask
---
Japanese BERT-base (Vaporetto + WordPiece)
===
## How to load the tokenizer
Please download the dictionary file for Vaporetto + WordPiece from [our GitHub repository](https://github.com/hitachi-nlp/compare-ja-tokenizer/blob/public/data/dict/vaporetto_wordpiece.json).
Then you can load the tokenizer by specifying the path of the dictionary file to `dict_path`.
```python
from typing import Optional
from tokenizers import Tokenizer, NormalizedString, PreTokenizedString
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import PreTokenizer
from transformers import PreTrainedTokenizerFast
import vaporetto
import textspan
class VaporettoPreTokenizer:
    """Custom pre-tokenizer that splits text into words with Vaporetto.

    Designed to be wrapped with ``tokenizers.pre_tokenizers.PreTokenizer.custom``
    so that the WordPiece model operates on Vaporetto word boundaries.
    """

    def __init__(self, unidic_path: str):
        # NOTE(review): despite the name, this is the path to a Vaporetto model
        # file (e.g. bccwj-suw+unidic+tag.model.zst), not a UniDic dictionary;
        # the parameter name is kept as-is for backward compatibility.
        with open(unidic_path, 'rb') as fp:
            model = fp.read()
        self.tokenizer = vaporetto.Vaporetto(model, predict_tags=False)

    def tokenize(self, sequence: str) -> list[str]:
        """Return the surface forms of the Vaporetto tokens in ``sequence``."""
        tokens = self.tokenizer.tokenize(sequence)
        return [token.surface() for token in tokens]

    def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
        """Split one ``NormalizedString`` into per-token substrings.

        Uses ``textspan`` to map the tokens back onto character offsets of the
        original text so the tokenizer keeps its alignment information.
        """
        text = str(normalized_string)
        tokens = self.tokenize(text)
        tokens_spans = textspan.get_original_spans(tokens, text)
        # fix: local variable was misspelled "cahr_spans" in the original
        return [normalized_string[st:ed] for char_spans in tokens_spans for st, ed in char_spans]

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Entry point invoked by the tokenizers library."""
        pretok.split(self.custom_split)
# Build the custom pre-tokenizer from a local Vaporetto model file.
pre_tokenizer = VaporettoPreTokenizer("/path/to/bccwj-suw+unidic+tag.model.zst")

# Load the WordPiece tokenizer from the downloaded dictionary file.
# fix: the path must be a string literal (the original snippet was missing quotes)
dict_path = "/path/to/vaporetto_wordpiece.json"
tokenizer = Tokenizer.from_file(dict_path)
# Add [CLS] ... [SEP] around each encoded sequence, BERT-style.
tokenizer.post_processor = BertProcessing(
    cls=("[CLS]", tokenizer.token_to_id('[CLS]')),
    sep=("[SEP]", tokenizer.token_to_id('[SEP]'))
)

# Wrap it as a PreTrainedTokenizerFast so it plugs into transformers.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token='[UNK]',
    cls_token='[CLS]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    mask_token='[MASK]'
)

# Attach the custom pre-tokenizer to the underlying fast tokenizer.
tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(pre_tokenizer)
```
```python
# Test
test_str = "こんにちは。私は形態素解析器について研究をしています。"
tokenizer.convert_ids_to_tokens(tokenizer(test_str).input_ids)
# -> ['[CLS]','こ','##ん','##に','##ち','##は','。','私','は','形態','素','解析','器','に','つい','て','研究','を','し','て','い','ます','。','[SEP]']
```
## How to load the model
```python
from transformers import AutoModelForMaskedLM
# Load the masked-LM checkpoint for this model from the Hugging Face Hub.
model = AutoModelForMaskedLM.from_pretrained("hitachi-nlp/bert-base_vaporetto-wordpiece")
```
**See [our repository](https://github.com/hitachi-nlp/compare-ja-tokenizer) for more details!**
|