# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
#
# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tokenizers import ByteLevelBPETokenizer
from transformers import AutoTokenizer

# Step 1: Initialize ByteLevelBPETokenizer
# (kept commented out; only needed when rebuilding the tokenizer from vocab/merges files)
# tokenizer = ByteLevelBPETokenizer(
#     "vocab.json",
#     "merges.txt"
# )

# Step 2: Save the tokenizer configuration
# tokenizer.save_model("auto_model")

# Step 3: Load the tokenizer using AutoTokenizer from the current directory
auto_tokenizer = AutoTokenizer.from_pretrained("./", use_fast=False, trust_remote_code=True)

# Test the tokenizer on a simple round trip: encode to ids, then decode back to text
text = "Hello, world!"
encoded = auto_tokenizer.encode(text)
decoded = auto_tokenizer.decode(encoded)

print("Encoded:", encoded)
print("Decoded:", decoded)

# Test the chat template on a short multi-turn conversation
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm good, thank you! How can I help you today?"},
    {"role": "user", "content": "Nothing"},
]
print('messages:', messages)
ids = auto_tokenizer.apply_chat_template(messages)
print(f"input_ids:\t{ids}")
text = auto_tokenizer.decode(ids)
print(f"input_text:\t[{text}]")
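
# A minimal sketch, not part of the original script: Hugging Face's
# apply_chat_template also accepts add_generation_prompt=True, which appends the
# template's assistant-turn prefix so the ids are ready to feed into generation.
# Whether this repo's chat template defines such a prefix is an assumption here.
gen_ids = auto_tokenizer.apply_chat_template(messages, add_generation_prompt=True)
print(f"input_ids (with generation prompt):\t{gen_ids}")
print(f"input_text (with generation prompt):\t[{auto_tokenizer.decode(gen_ids)}]")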