---
license: apache-2.0
base_model:
- Qwen/Qwen2-VL-2B-Instruct
---
# Requirements
This model is compatible with any ONNX runtime.
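For example, with the Python `onnxruntime` package (any ONNX-compatible runtime works; this is just a minimal sketch) you can confirm the runtime is installed and see which execution providers it exposes before loading the model:

```python
import onnxruntime as ort

# Print the runtime version and the execution providers
# (CPUExecutionProvider, CUDAExecutionProvider, ...) available in this build.
print(ort.__version__)
print(ort.get_available_providers())
```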
# Running this model
**JavaScript**
See https://huggingface.co/spaces/pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16 for a demo.
**Python**
Download the script below as [infer.py](./infer.py) and run it with the Hugging Face repo (used for the config and tokenizer) as the first argument and the directory containing the exported ONNX files as the second:
`python3 infer.py Qwen/Qwen2-VL-2B-Instruct 'path-to/Qwen2-VL-2B-Instruct-onnx/onnx'`
```python
import os
import sys
import time
import torch
import numpy as np
import requests
import onnxruntime as ort
from PIL import Image
from io import BytesIO
from transformers import Qwen2VLConfig, AutoTokenizer
# Command line arguments
model_path = sys.argv[1]
onnx_path = sys.argv[2]
# Initialize model config and tokenizer
model_config = Qwen2VLConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Model configuration
max_length = 1024
num_attention_heads = model_config.num_attention_heads
num_key_value_heads = model_config.num_key_value_heads
head_dim = model_config.hidden_size // num_attention_heads
num_layers = model_config.num_hidden_layers
# Setup ONNX sessions
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# Model paths and sessions
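# Roles of the five sub-graphs as they are used in this script: A encodes the image,
# B embeds token ids, C produces the batch-size tensor, D merges the text and image
# embeddings, and E runs a single decoder step with the key/value caches.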
models = ['A', 'B', 'C', 'D', 'E']
model_paths = {m: os.path.join(onnx_path, f'QwenVL_{m}_q4f16.onnx') for m in models}
sessions = {m: ort.InferenceSession(path, sess_options=session_options) for m, path in model_paths.items()}
# Input/output names
inputs = {
    'A': sessions['A'].get_inputs()[0].name,
    'B': [sessions['B'].get_inputs()[i].name for i in range(2)],
    'C': sessions['C'].get_inputs()[0].name,
    'D': [inp.name for inp in sessions['D'].get_inputs()],
    'E': [inp.name for inp in sessions['E'].get_inputs()]
}
outputs = {
    'A': sessions['A'].get_outputs()[0].name,
    'B': sessions['B'].get_outputs()[0].name,
    'C': sessions['C'].get_outputs()[0].name,
    'D': [out.name for out in sessions['D'].get_outputs()],
    'E': [out.name for out in sessions['E'].get_outputs()]
}
# Process image
image_url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg'
image = Image.open(BytesIO(requests.get(image_url).content)).resize((960, 960)).convert('RGB')
image_array = np.expand_dims(np.transpose(np.array(image).astype(np.float32), (2, 0, 1)), axis=0) / 255.
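# image_array is a 1x3x960x960 float32 tensor in CHW layout, scaled to [0, 1].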
# Prepare inputs
prompt = "Describe this image."
formatted_prompt = f"\n<|im_start|>user\n<|vision_start|><|vision_end|>{prompt}<|im_end|>\n<|im_start|>assistant\n"
input_ids = tokenizer(formatted_prompt, return_tensors='pt')['input_ids']
input_lengths = np.array([input_ids.shape[1]], dtype=np.int64)
tokens = np.zeros(max_length, dtype=np.int32)
tokens[:input_ids.shape[1]] = input_ids[0, :]
position = np.zeros(1, dtype=np.int64)
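# 'tokens' is a fixed-length id buffer (padded to max_length); 'position' tracks the decode position.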
# Initialize caches
key_cache = np.zeros((num_layers, num_key_value_heads, max_length, head_dim), dtype=np.float16)
value_cache = key_cache.copy()
# Process initial inputs
hidden_states = sessions['B'].run(
    [outputs['B']],
    {inputs['B'][0]: tokens, inputs['B'][1]: input_lengths}
)[0]
batch_size = np.array(0, dtype=np.int32)
batch_size, = sessions['C'].run([outputs['C']], {inputs['C']: batch_size})
# Process image features
image_features = sessions['A'].run([outputs['A']], {inputs['A']: image_array})[0]
total_ids = 100  # number of image tokens (10 * 10 from the original export factors)
input_lengths += total_ids
remaining_tokens = np.array(max_length - input_lengths[0] - total_ids, dtype=np.int32)
tokens_to_stop = np.array(input_lengths[0] - 5, dtype=np.int32)
hidden_states, batch_size = sessions['D'].run(
    outputs['D'],
    dict(zip(inputs['D'],
             [hidden_states, image_features, input_lengths, tokens_to_stop, remaining_tokens]))
)
# Generate tokens
start_time = time.time()
for i in range(12):  # MAX_ITERATIONS (upper bound on generated tokens)
    # Run one decoder step (E). The second input is -65504.0 (the most negative finite
    # float16, presumably an attention-mask value) on the first step and 0.0 afterwards;
    # the last input switches from an initial offset to the running position.
    token, key_cache, value_cache = sessions['E'].run(
        outputs['E'],
        dict(zip(inputs['E'],
                 [hidden_states, np.array([-65504. if i == 0 else 0.], dtype=np.float16),
                  key_cache, value_cache, position, input_lengths, batch_size,
                  np.array([1 - total_ids + 10 if i == 0 else position[0] + 1], dtype=np.float16)]))
    )
    if token in [151643, 151645]:  # End-of-sequence token ids
        break
    if i < 1:
        # After the first (prefill) step, advance past the prompt and feed one token at a time.
        position += input_lengths[0]
        input_lengths[0] = 1
    else:
        position += 1
    tokens[0] = token
    # Re-embed the newly generated token with B for the next decoder step.
    hidden_states = sessions['B'].run(
        [outputs['B']],
        {inputs['B'][0]: tokens, inputs['B'][1]: input_lengths}
    )[0]
    print(tokenizer.decode(token), end='', flush=True)
print(f"\nTotal time: {time.time() - start_time:.2f}s")
```
# Technical Information
- [EXPORT.md](EXPORT.md)