conan1024hao · commit bbf4b6d: support sentencepiece tokenizer
{
  "do_lower_case": false,
  "remove_space": true,
  "keep_accents": true,
  "bos_token": "[CLS]",
  "eos_token": "[SEP]",
  "unk_token": "[UNK]",
  "sep_token": "[SEP]",
  "pad_token": "[PAD]",
  "cls_token": "[CLS]",
  "mask_token": {
    "content": "[MASK]",
    "single_word": false,
    "lstrip": true,
    "rstrip": false,
    "normalized": false,
    "__type": "AddedToken"
  },
  "sp_model_kwargs": {},
  "special_tokens_map_file": null,
  "tokenizer_class": "BertJapaneseTokenizer",
  "word_tokenizer_type": "jumanpp",
  "subword_tokenizer_type": "sentencepiece",
  "jumanpp_kwargs": {}
}
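
Given this configuration, AutoTokenizer resolves "tokenizer_class" to transformers' BertJapaneseTokenizer, which first segments text into words with Juman++ ("word_tokenizer_type") and then splits those words into subwords with a SentencePiece model ("subword_tokenizer_type"). A minimal loading sketch in Python — the repo id is a placeholder, and running it assumes the sentencepiece package plus a working Juman++ installation (with its Python binding) are available:

from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual Hugging Face model repository.
tokenizer = AutoTokenizer.from_pretrained("user/japanese-model")

# Juman++ word segmentation, then SentencePiece subword splitting.
print(tokenizer.tokenize("自然言語処理は楽しい。"))  # "NLP is fun."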