import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    trust_remote_code=True
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Prepare the input
chat = [
    {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
    {"role": "user", "content": "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"}
]

# add_generation_prompt=True appends the assistant-turn header, so the model
# starts writing an answer instead of continuing the user message
conversation_str = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
input_ids = tokenizer.encode(conversation_str, return_tensors="pt", add_special_tokens=False).to(device)

# Create the attention_mask: a single unpadded sequence attends to every
# position, so the mask is all ones. (Comparing against pad_token_id can
# zero out real tokens when the pad token doubles as the EOS token.)
attention_mask = torch.ones_like(input_ids)

# Run inference
try:
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=512,  # adjust as needed; step-by-step math solutions often exceed 100 tokens
            attention_mask=attention_mask
        )
    # Decode only the newly generated tokens, not the echoed prompt
    response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
    print("Generated answer:", response)
except Exception as e:
    print(f"Inference error: {e}")
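
# Sanity check on the expected output (our arithmetic, not from a model run):
# 16 eggs − 3 eaten − 4 baked = 9 sold, and 9 × $2 = $18, so a correct
# generation should end with \boxed{18}.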