from transformers import BertTokenizer

# Load the pretrained tokenizer for Chinese BERT (downloaded and cached on first use)
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,          # None -> use the default Hugging Face cache directory
    force_download=False     # reuse previously downloaded files if present
)

# Example Chinese review sentences used throughout this script
sents = [
    '选择珠江花园的原因就是方便。',    # "The reason for choosing Pearl River Garden is convenience."
    '笔记本的键盘确实爽。',            # "The laptop's keyboard really is nice."
    '房间太小。其他的都一般。',        # "The room is too small; everything else is just average."
    '今天才知道这书还有第6卷,真有点郁闷。',      # "Only today did I learn this book has a volume 6; a bit annoying."
    '机器背面似乎被撕了张什么标签,残胶还在。',  # "Some label seems to have been torn off the back of the machine; the glue residue is still there."
]

# Encode a sentence pair and decode the token ids back to text
enc_out = tokenizer.encode(
    text=sents[0], text_pair=sents[1],
    truncation=True, padding='max_length', add_special_tokens=True,
    max_length=30, return_tensors=None   # None -> return a plain Python list of ids
)
print(enc_out)

dec_out = tokenizer.decode(enc_out)   # maps ids back to text, including [CLS]/[SEP]/[PAD]
print(dec_out)
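
# encode() above returns only the token ids. A sketch of getting the full
# model inputs (input_ids, token_type_ids, attention_mask) by calling the
# tokenizer directly with the same arguments.
full_out = tokenizer(
    text=sents[0], text_pair=sents[1],
    truncation=True, padding='max_length', max_length=30
)
print(full_out.keys())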


# Add a new token to the vocabulary: first check whether it is already there
# (the add step itself is sketched below)
all_dict = tokenizer.get_vocab()                # dict mapping token -> id
print('dict length:', len(all_dict))
print('月光' in all_dict)                        # is the word '月光' ("moonlight") already a single vocab entry?
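
# A minimal sketch of the add step itself, assuming the standard
# tokenizer.add_tokens API from transformers; '月光' is just the example
# word checked above.
num_added = tokenizer.add_tokens(['月光'])
print('tokens added:', num_added)               # number of tokens actually added
print('月光' in tokenizer.get_vocab())           # now present as a single vocab entry
# When pairing the tokenizer with a model, resize its embeddings afterwards:
# model.resize_token_embeddings(len(tokenizer))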