import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
device = "cuda:0"
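# trust_remote_code=True lets transformers run the custom model/tokenizer code shipped with the checkpoint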
tokenizer = AutoTokenizer.from_pretrained("glm-4-voice-9b", trust_remote_code=True)
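# Override the chat template with a minimal "role: content" format for building the prompt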
tokenizer.chat_template = "{% for message in messages %}{{ message['role'] }}: {{ message['content'] }}{% endfor %}"
query = "你好"
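# Tokenize a sample one-turn chat ("你好" means "Hello") into model inputs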
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": query}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
)
inputs = inputs.to(device)
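# Load the model with its weights quantized to 4-bit on the fly (requires the bitsandbytes package)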
model = AutoModelForCausalLM.from_pretrained(
    "glm-4-voice-9b",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map={"": device},
).eval()
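# Export the 4-bit weights and tokenizer to a local int4 checkpoint directory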
model.save_pretrained("glm-4-voice-9b-int4")
tokenizer.save_pretrained("glm-4-voice-9b-int4")
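# A minimal sketch of reloading the exported checkpoint and running a short
# generation as a smoke test. It assumes the directory name used above, a
# bitsandbytes version that supports 4-bit serialization, and that a plain
# generate() call is enough for a sanity check; real voice inference would go
# through the model's own pipeline. max_new_tokens is an arbitrary choice.
int4_model = AutoModelForCausalLM.from_pretrained(
    "glm-4-voice-9b-int4",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map={"": device},
).eval()
with torch.no_grad():
    outputs = int4_model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))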