Update bert_token_handler.py
bert_token_handler.py (+8 -1)
@@ -12,7 +12,14 @@ sents = ['选择珠江花园的原因就是方便。',
          '今天才知道这书还有第6卷,真有点郁闷。',
          '机器背面似乎被撕了张什么标签,残胶还在。']
 
+# -- simple sentence encode & decode
 enc_out = tokenizer.encode(text=sents[0], text_pair=sents[1], truncation=True, padding='max_length', add_special_tokens=True, max_length=30, return_tensors=None)
 print(enc_out)
 dec_out = tokenizer.decode(enc_out)
-print(dec_out)
+print(dec_out)
+
+
+# -- add new token into dict
+all_dict = tokenizer.get_vocab()
+print('dict length:', len(all_dict))
+print('月光' in all_dict)
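
Context for the hunk above: `tokenizer` is defined earlier in the file, outside this diff. A minimal sketch of that setup, assuming a Hugging Face `BertTokenizer` loaded from the `bert-base-chinese` checkpoint (the checkpoint name is a guess based on the Chinese sample sentences; the commit does not show it):

from transformers import BertTokenizer

# Assumed setup -- not part of this commit; 'bert-base-chinese' is a guess.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# With return_tensors=None, encode() returns a plain Python list of token ids
# laid out as [CLS] sents[0] [SEP] sents[1] [SEP], truncated and padded to
# max_length=30. decode() maps the ids back to a string, special tokens included.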
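
Note on the `-- add new token into dict` block: the added lines only inspect the current vocabulary, printing its size and whether the word 月光 ("moonlight") is already a key. Actually registering a new token is a separate call. A hedged sketch using the stock `add_tokens` API (whether 月光 is in fact absent from the base vocab is not confirmed by this commit):

# add_tokens() registers strings the vocab lacks and returns how many were new.
num_added = tokenizer.add_tokens(['月光'])
print('newly added:', num_added)
print('月光' in tokenizer.get_vocab())  # True once registered
# Any model consuming these ids then needs its embedding table enlarged, e.g.:
# model.resize_token_embeddings(len(tokenizer))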