# HF_tokenizer
# from tokenizers import Tokenizer
# tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
import sentencepiece as spm
text = "nice job 华为手机" | |
text = "<s>世界上最高的山是哪座山?</s><pad>" # 29871, 41334, 30528, 30210, 30329, 41894, 31780, 30329, 30882, | |
tokenizer = spm.SentencePieceProcessor(model_file="tokenizer/tokenizer.model")
tokens = tokenizer.encode(text)  # [7575, 4982, 29871, 31266, 30573, 30880, 31429] - raw SentencePiece ids, no BOS
print(tokens)
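# Optional sanity check (a sketch, assuming the same tokenizer.model is loaded):
# map each id back to its piece; the "▁" prefix marks a word boundary in SentencePiece,
# and decode() reassembles the original string.
print([tokenizer.id_to_piece(t) for t in tokens])
print(tokenizer.decode(tokens))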
from transformers import LlamaTokenizer
tokenizer = LlamaTokenizer.from_pretrained("tokenizer")
tokens = tokenizer.encode(text)  # [1, 7575, 4982, 29871, 31266, 30573, 30880, 31429] - BOS (<s>, id 1) prepended
print(tokens)
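# Note the leading 1: LlamaTokenizer prepends the BOS token by default, otherwise the
# ids match the raw SentencePiece output above. A quick round-trip sketch - decoding
# with skip_special_tokens=True drops the BOS marker again:
print(tokenizer.decode(tokens, skip_special_tokens=True))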