Update bert_token_handler.py
bert_token_handler.py (+8 -1)
@@ -12,7 +12,14 @@ sents = ['选择珠江花园的原因就是方便。',
          '今天才知道这书还有第6卷,真有点郁闷。',
          '机器背面似乎被撕了张什么标签,残胶还在。']
 
+# -- simple sentence encode & decode
 enc_out = tokenizer.encode(text=sents[0], text_pair=sents[1], truncation=True, padding='max_length', add_special_tokens=True, max_length=30, return_tensors=None)
 print(enc_out)
 dec_out = tokenizer.decode(enc_out)
-print(dec_out)
+print(dec_out)
+
+
+# -- add new token into dict
+all_dict = tokenizer.get_vocab()
+print('dict length:', len(all_dict))
+print('月光' in all_dict)
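
Context for the hunk above: `tokenizer` is defined earlier in the file, outside this diff. A minimal sketch of that setup, assuming a Hugging Face `BertTokenizer` loaded from the `bert-base-chinese` checkpoint (the checkpoint name is a guess based on the Chinese sample sentences; the commit does not show it):

from transformers import BertTokenizer

# Assumed setup -- not part of this commit; 'bert-base-chinese' is a guess.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# With return_tensors=None, encode() returns a plain Python list of token ids
# laid out as [CLS] sents[0] [SEP] sents[1] [SEP], truncated and padded to
# max_length=30. decode() maps the ids back to a string, special tokens included.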
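
Note on the `-- add new token into dict` block: the added lines only inspect the current vocabulary, printing its size and whether the word 月光 ("moonlight") is already a key. Actually registering a new token is a separate call. A hedged sketch using the stock `add_tokens` API (whether 月光 is in fact absent from the base vocab is not confirmed by this commit):

# add_tokens() registers strings the vocab lacks and returns how many were new.
num_added = tokenizer.add_tokens(['月光'])
print('newly added:', num_added)
print('月光' in tokenizer.get_vocab())  # True once registered
# Any model consuming these ids then needs its embedding table enlarged, e.g.:
# model.resize_token_embeddings(len(tokenizer))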