line-distilbert-base-japanese / tokenizer_config.json
{
  "do_lower_case": true,
  "remove_space": true,
  "keep_accents": true,
  "bos_token": "[CLS]",
  "eos_token": "[SEP]",
  "unk_token": "<unk>",
  "sep_token": "[SEP]",
  "pad_token": "<pad>",
  "cls_token": "[CLS]",
  "mask_token": {
    "content": "[MASK]",
    "single_word": false,
    "lstrip": true,
    "rstrip": false,
    "normalized": false,
    "__type": "AddedToken"
  },
  "tokenize_chinese_chars": false,
  "tokenizer_class": "BertJapaneseTokenizer",
  "word_tokenizer_type": "mecab",
  "subword_tokenizer_type": "sentencepiece",
  "mecab_kwargs": {
    "mecab_dic": "unidic_lite"
  },
  "auto_map": {
    "AutoTokenizer": [
      "distilbert_japanese_tokenizer.DistilBertJapaneseTokenizer",
      null
    ]
  }
}
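
Usage note (not part of the config file itself): because "auto_map" resolves AutoTokenizer to custom tokenizer code shipped in the repository, loading it requires trust_remote_code=True, and the MeCab word tokenizer with the unidic_lite dictionary needs the fugashi and unidic-lite packages installed. A minimal Python sketch follows, assuming the Hub repo id is line-corporation/line-distilbert-base-japanese:

from transformers import AutoTokenizer

# Load the tokenizer described by this tokenizer_config.json.
# trust_remote_code=True is required because "auto_map" points
# AutoTokenizer at distilbert_japanese_tokenizer.DistilBertJapaneseTokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    "line-corporation/line-distilbert-base-japanese",  # assumed repo id
    trust_remote_code=True,
)

# Text is first split into words by MeCab with the unidic_lite dictionary
# ("word_tokenizer_type": "mecab", "mecab_kwargs"), then each word is broken
# into subwords by SentencePiece ("subword_tokenizer_type": "sentencepiece").
print(tokenizer.tokenize("こんにちは、世界。"))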