What am I doing wrong?
I am assuming this is a chat model?
I tried it with 4-bit quants using this simple script:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

def main():
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",  # quantization type; "nf4" is commonly used
        bnb_4bit_compute_dtype=torch.float16,
    )

    model_name = "LumiOpen/Poro-34B-chat-OpenAssistant"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load the model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="cuda",
        torch_dtype=torch.float16,
    )
    model.eval()

    while True:
        prompt = input("User: ")
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

        # Generate a response
        outputs = model.generate(**inputs, max_new_tokens=50)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("Model response:")
        print(response)
        # repeats question?!

if __name__ == "__main__":
    main()
Unfortunately, outputs[0] seems to contain only my prompt input. Is there a prompt template I should use, or what is wrong?
I'm sorry, I don't know much about quantized models, but the Poro chat models use ChatML and have the chat format defined in the tokenizer, so you should use the Hugging Face Transformers chat facilities to work with them. I haven't tested the code, but it should be something like this:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer
# (using the chat-tuned model from your script, so the tokenizer has a chat template)
model_name = "LumiOpen/Poro-34B-chat-OpenAssistant"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # Use bfloat16 for efficiency
    device_map="auto"            # Automatically handle device mapping
)

# Create a chat conversation
messages = [
    {"role": "user", "content": "Hello, can you introduce yourself?"}
]

# Format the messages using the chat template in the tokenizer
input_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True  # Append the assistant turn so the model continues as the assistant
)

# Tokenize the formatted input
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Generate a response
with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode only the newly generated tokens (skip the prompt portion)
response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(response)
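I haven't tried it together with quantization myself, but combining your BitsAndBytesConfig with the chat template should just mean passing both when loading the model. An untested sketch, reusing the model name and quantization settings from your script:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "LumiOpen/Poro-34B-chat-OpenAssistant"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
)
model.eval()

# Build the prompt through the tokenizer's chat template
messages = [{"role": "user", "content": "Osaatko kertoa Turusta?"}]
input_text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=200)

# Decode only the newly generated tokens, so the prompt isn't echoed back
print(tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))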
Thanks! It seems that setting max_new_tokens a bit higher (the prompt gets repeated first) already produced output:
<|prompter|>Osaatko kertoa Turusta?<|endoftext|><|assistant|>Toki! Turku on Suomen vanhin kaupunki ja sijaitsee Lounais-Suomessa, Aurajoen suulla. Se on kaunis ja historiallinen kaupunki, jossa on rikas kulttuuriperintö. Turussa on paljon nähtävää ja tehtävää, kuten Turun linna, Tuomiokirkko ja Aboa Vetus & Ars Nova -museo. Kaupunki on myös tunnettu vilkkaasta yöelämästään, herkullisesta ruoastaan ja kauniista rannikostaan. Jos etsit hauskaa ja kulttuurisesti rikasta kaupunkilomaa, Turku on ehdottomasti vierailun arvoinen!
However, quantized to 4 bits, it took 61 minutes on a 4080 with 16 GB of VRAM to answer with max 200 new tokens...
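To see whether the slowness comes from layers being pushed off the GPU, I plan to print the device placement and memory use after loading. A quick sketch (assumes `model` is the already-loaded model from the script above):

import torch

# `model` was loaded with a device_map; any module mapped to "cpu" or "disk"
# will make generation very slow.
print(getattr(model, "hf_device_map", None))

# Peak GPU memory allocated by PyTorch vs. the card's total memory
used_gb = torch.cuda.max_memory_allocated() / 1e9
total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"GPU memory: {used_gb:.1f} GB peak allocated / {total_gb:.1f} GB total")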
I have to test whether better prompting makes inference faster; I think my tokenizer setup is correct, though...
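To double-check the tokenizer, printing what the chat template renders (without tokenizing) should show the same <|prompter|>/<|assistant|> markers as in the output above. A quick sketch:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("LumiOpen/Poro-34B-chat-OpenAssistant")
messages = [{"role": "user", "content": "Osaatko kertoa Turusta?"}]

# Render the chat template as a plain string to inspect the prompt format
rendered = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(repr(rendered))
# Expected to look roughly like: '<|prompter|>Osaatko kertoa Turusta?<|endoftext|><|assistant|>'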
I hope you can distill the model if you get access to LUMI; some applications in Finland need a Finnish LLM.