"""Round-trip demo for the Baichuan2-7B-Chat tokenizer.

The bug referenced by this script has been resolved, see
https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/commit/e03d54f05b9d42740c43a191c5d2914fcfb4c6e5
"""

import os

from transformers import AutoTokenizer

from vocab import TokenizerType

# Directory containing this file; the tokenizer files are expected in a
# "Baichuan2-7B-Chat" sub-directory next to it.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Baichuan2-7B-Chat")

# Load from the local copy. trust_remote_code=True allows the tokenizer
# implementation shipped with the model files to be executed.
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_DIR,
    trust_remote_code=True,
)
# Alternative: load straight from the Hugging Face hub.
# tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan2-7B-Chat", trust_remote_code=True)

# Encode the empty string; this result is overwritten further down.
token_ids = tokenizer.encode("")
# Kept for reference: decoding this id was noted as out of range.
# token1 = tokenizer.decode(125696)  # out of range

# Encode a single CJK character, map the ids back to token strings, join
# the tokens into text again, and print the result.
token_ids = tokenizer.encode("中")
filtered_tokens = tokenizer.convert_ids_to_tokens(token_ids)
decoded_text = tokenizer.convert_tokens_to_string(filtered_tokens)
print(decoded_text)