I was facing a lot of issues while running this model. Below is the format that worked for me:
from transformers import AutoTokenizer
import onnxruntime as ort
import numpy as np
# Load tokenizer
model_name = "nvidia/Meta-Llama-3.2-3B-Instruct-ONNX-INT4"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Ensure the pad_token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or '[PAD]'
# Load ONNX model
onnx_model_path = r"C:\Users\P53\.cache\huggingface\hub\models--nvidia--Meta-Llama-3.2-3B-Instruct-ONNX-INT4\snapshots\2ee24420483cc401fb83feff92298518d9ae10d1\model.onnx"
session = ort.InferenceSession(onnx_model_path)
# Inspect the ONNX model inputs
required_inputs = [inp.name for inp in session.get_inputs() if 'past_key_values' in inp.name]
num_layers = len(required_inputs) // 2 # Divide by 2 (key and value for each layer)
# Define text generation
def generate_text(prompt, max_length=50):
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="np", padding=True, truncation=True)
    input_ids = inputs["input_ids"].astype(np.int64)  # Convert to int64
    attention_mask = inputs["attention_mask"].astype(np.int64)  # Convert to int64
    position_ids = np.arange(input_ids.shape[1])[None, :].astype(np.int64)  # Convert to int64
    # Initialize an empty KV cache (past length 0), so each step recomputes the
    # full sequence. 8 KV heads and head size 128 match Llama 3.2 3B; adjust the
    # shape/dtype if the session reports a mismatch.
    past_key_values = [
        np.zeros((input_ids.shape[0], 8, 0, 128), dtype=np.float16)
        for _ in range(num_layers * 2)  # one key and one value tensor per layer
    ]
    # Generate text
    for _ in range(max_length):
        # Prepare model inputs
        ort_inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
        }
        # Add past_key_values to inputs
        for i in range(num_layers):
            ort_inputs[f"past_key_values.{i}.key"] = past_key_values[i * 2]
            ort_inputs[f"past_key_values.{i}.value"] = past_key_values[i * 2 + 1]

        # Run the model
        outputs = session.run(None, ort_inputs)
        logits = outputs[0]  # Get logits
        next_token_id = np.argmax(logits[:, -1, :], axis=-1).reshape(-1, 1)  # Greedy pick

        # Update input_ids, attention_mask, and position_ids
        input_ids = np.hstack((input_ids, next_token_id))
        attention_mask = np.hstack((attention_mask, np.ones((input_ids.shape[0], 1), dtype=np.int64)))
        position_ids = np.hstack((position_ids, position_ids[:, -1:] + 1))

        # Stop if EOS token is generated
        if next_token_id[0, 0] == tokenizer.eos_token_id:
            break

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)
# Test example
prompt = "Explain the concept of gravity in simple terms:"
output = generate_text(prompt)
print(output)
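If the session still complains about missing or unexpected inputs, it helps to print what the exported graph actually declares instead of guessing. This is plain onnxruntime API, so it should work unchanged with the `session` created above:

```python
# Dump every declared input/output with its dtype and (possibly symbolic) shape,
# so the past_key_values / present names and the expected KV cache dtype can be confirmed.
for inp in session.get_inputs():
    print("input :", inp.name, inp.type, inp.shape)
for out in session.get_outputs():
    print("output:", out.name, out.type, out.shape)
```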
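The loop above passes an empty cache and re-feeds the whole sequence on every step, which works but gets slow for long outputs. If the graph also returns the updated cache (commonly as outputs named `present.{i}.key` / `present.{i}.value` right after the logits; that naming is an assumption you can verify with the dump above), the loop body can reuse it and feed only the newest token. A minimal sketch of that variant, using the same variables as in `generate_text`:

```python
# Assumes outputs[0] is the logits and outputs[1:] are present.{i}.key / present.{i}.value
# in the same layer order used when building ort_inputs; verify with session.get_outputs().
outputs = session.run(None, ort_inputs)
logits, past_key_values = outputs[0], list(outputs[1:])  # reuse the returned cache next step

next_token_id = np.argmax(logits[:, -1, :], axis=-1).reshape(-1, 1)

# Feed only the newest token from now on; the attention mask keeps growing so it
# covers past + new tokens, and position_ids continues from the last position.
input_ids = next_token_id.astype(np.int64)
attention_mask = np.hstack((attention_mask, np.ones((attention_mask.shape[0], 1), dtype=np.int64)))
position_ids = position_ids[:, -1:] + 1
```

Note this changes what `input_ids` holds (only the latest token), so the final decode would need to accumulate the generated ids separately.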