pdufour committed (verified)
Commit 0018e62 · 1 Parent(s): 8fa9eab

Update README.md

Files changed (1): README.md (+243 -0)

README.md CHANGED
@@ -7,7 +7,250 @@ base_model:

This is compatible with any ONNX runtime.

# Running this model

**JavaScript**

See https://huggingface.co/pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16 for a demo.
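
For a quick start outside the demo, the sketch below only loads one of the exported parts with onnxruntime-web and prints its input/output names; it is not the full five-model generation loop, and the file path is an assumption based on the files in this repository.

```
// Minimal sketch: load a single ONNX part in the browser with onnxruntime-web
// and inspect its I/O names. See the demo linked above for the full pipeline.
import * as ort from 'onnxruntime-web';

const session = await ort.InferenceSession.create('QwenVL_B_q4f16.onnx');
console.log('inputs:', session.inputNames);
console.log('outputs:', session.outputNames);
```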

**Python**

```
import time
import torch
import numpy as np
import onnxruntime
from PIL import Image
import os
import sys
import requests
from io import BytesIO


# Fall back to the export defaults if export_config.py is not available.
try:
    from export_config import INPUT_IMAGE_SIZE, IMAGE_RESIZE, MAX_SEQ_LENGTH, HEIGHT_FACTOR, WIDTH_FACTOR
except ImportError:
    INPUT_IMAGE_SIZE = [960, 960]
    HEIGHT_FACTOR = 10
    WIDTH_FACTOR = 10
    IMAGE_RESIZE = [HEIGHT_FACTOR * 28, WIDTH_FACTOR * 28]
    MAX_SEQ_LENGTH = 1024

path = sys.argv[1]        # base model path (tokenizer + config)
script_dir = sys.argv[2]  # directory containing the exported ONNX parts

onnx_model_A = os.path.join(script_dir, 'QwenVL_A.onnx')
onnx_model_B = os.path.join(script_dir, 'QwenVL_B_q4f16.onnx')
onnx_model_C = os.path.join(script_dir, 'QwenVL_C_q4f16.onnx')
onnx_model_D = os.path.join(script_dir, 'QwenVL_D_q4f16.onnx')
onnx_model_E = os.path.join(script_dir, 'QwenVL_E_q4f16.onnx')
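
# What the five ONNX parts appear to do (inferred from their inputs/outputs below):
#   A - vision encoder: pixel_values -> image embeddings
#   B - token embedding: input_ids -> hidden states
#   C - initial position ids
#   D - merges the prompt hidden states with the image embeddings
#   E - decoder with KV cache: hidden states -> next token id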

print("\n[PATHS] ONNX model paths:")
print(f" Model A: {onnx_model_A}")
print(f" Model B: {onnx_model_B}")
print(f" Model C: {onnx_model_C}")
print(f" Model D: {onnx_model_D}")
print(f" Model E: {onnx_model_E}")

image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
query = "Describe this image."

from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer

# The PyTorch model is loaded only to read its config values (head counts, layer count, ...).
with torch.inference_mode():
    model = Qwen2VLForConditionalGeneration.from_pretrained(path, torch_dtype=torch.float32, device_map="mps", low_cpu_mem_usage=True)
    max_seq_len = MAX_SEQ_LENGTH
    num_heads = model.config.num_attention_heads
    num_key_value_heads = model.config.num_key_value_heads
    head_dim = model.config.hidden_size // num_heads
    num_layers = model.config.num_hidden_layers
    hidden_size = model.config.hidden_size

max_single_chat_length = 12

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

session_opts = onnxruntime.SessionOptions()
session_opts.log_severity_level = 3
session_opts.inter_op_num_threads = 0
session_opts.intra_op_num_threads = 0
session_opts.enable_cpu_mem_arena = True
session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")

ort_session_A = onnxruntime.InferenceSession(onnx_model_A, sess_options=session_opts)
ort_session_B = onnxruntime.InferenceSession(onnx_model_B, sess_options=session_opts)
ort_session_C = onnxruntime.InferenceSession(onnx_model_C, sess_options=session_opts)
ort_session_D = onnxruntime.InferenceSession(onnx_model_D, sess_options=session_opts)
ort_session_E = onnxruntime.InferenceSession(onnx_model_E, sess_options=session_opts)

# Cache the input/output tensor names of each part.
in_name_A = ort_session_A.get_inputs()
out_name_A = ort_session_A.get_outputs()
in_name_A0 = in_name_A[0].name
out_name_A0 = out_name_A[0].name

in_name_B = ort_session_B.get_inputs()
out_name_B = ort_session_B.get_outputs()
in_name_B0 = in_name_B[0].name
in_name_B1 = in_name_B[1].name
out_name_B0 = out_name_B[0].name

in_name_C = ort_session_C.get_inputs()
out_name_C = ort_session_C.get_outputs()
in_name_C0 = in_name_C[0].name
out_name_C0 = out_name_C[0].name

in_name_D = ort_session_D.get_inputs()
out_name_D = ort_session_D.get_outputs()
in_name_D0 = in_name_D[0].name
in_name_D1 = in_name_D[1].name
in_name_D2 = in_name_D[2].name
in_name_D3 = in_name_D[3].name
in_name_D4 = in_name_D[4].name
out_name_D0 = out_name_D[0].name
out_name_D1 = out_name_D[1].name

in_name_E = ort_session_E.get_inputs()
out_name_E = ort_session_E.get_outputs()
in_name_E0 = in_name_E[0].name
in_name_E1 = in_name_E[1].name
in_name_E2 = in_name_E[2].name
in_name_E3 = in_name_E[3].name
in_name_E4 = in_name_E[4].name
in_name_E5 = in_name_E[5].name
in_name_E6 = in_name_E[6].name
in_name_E7 = in_name_E[7].name
out_name_E0 = out_name_E[0].name
out_name_E1 = out_name_E[1].name
out_name_E2 = out_name_E[2].name

# Fetch the test image and convert it to a normalized NCHW float tensor.
response = requests.get(image_url)
image = Image.open(BytesIO(response.content))

if image.mode != 'RGB':
    image = image.convert('RGB')

# NOTE: depending on how QwenVL_A.onnx was exported, the image may first need to be
# resized to INPUT_IMAGE_SIZE, e.g. image = image.resize((INPUT_IMAGE_SIZE[1], INPUT_IMAGE_SIZE[0])).
pixel_values = np.transpose(np.array(image).astype(np.float32), (2, 0, 1))
pixel_values = np.expand_dims(pixel_values, axis=0) / 255.0
use_vision = True

# Chat-template prompt with an empty vision span; part D splices the image embeddings in.
prompt = f"\n<|im_start|>user\n<|vision_start|><|vision_end|>{query}<|im_end|>\n<|im_start|>assistant\n"
prompt_head_len = np.array([5], dtype=np.int64)

image_embed_size = WIDTH_FACTOR * HEIGHT_FACTOR

token = tokenizer(prompt, return_tensors='pt')['input_ids']

ids_len = np.array([token.shape[1]], dtype=np.int64)

input_ids = np.zeros(max_seq_len, dtype=np.int32)
input_ids[:ids_len[0]] = token[0, :]

history_len = np.zeros(1, dtype=np.int64)

# Pre-allocated KV cache, filled in by part E during decoding.
past_key_states = np.zeros((num_layers, num_key_value_heads, max_seq_len, head_dim), dtype=np.float16)
past_values_states = past_key_states

attention_mask = np.array([-65504.0], dtype=np.float16)
pos_factor = np.array([0.0], dtype=np.float16)
pos_factor_v = 1 - image_embed_size + WIDTH_FACTOR
dummy = np.array(0, dtype=np.int32)

# Part B: embed the prompt tokens.
hidden_states = ort_session_B.run(
    [out_name_B0],
    {
        in_name_B0: input_ids,
        in_name_B1: ids_len
    })[0]

# Part C: initial position ids.
position_ids, = ort_session_C.run(
    [out_name_C0],
    {
        in_name_C0: dummy
    })

if use_vision:
    # Part A: encode the image, then let part D merge the image embeddings
    # into the prompt hidden states and recompute the position ids.
    image_embed = ort_session_A.run(
        [out_name_A0],
        {in_name_A0: pixel_values})[0]

    ids_len += image_embed_size

    split_factor = np.array(max_seq_len - ids_len[0] - image_embed_size, dtype=np.int32)

    ids_len_minus = np.array(ids_len[0] - prompt_head_len[0], dtype=np.int32)

    hidden_states, position_ids = ort_session_D.run(
        [out_name_D0, out_name_D1],
        {
            in_name_D0: hidden_states,
            in_name_D1: image_embed,
            in_name_D2: ids_len,
            in_name_D3: ids_len_minus,
            in_name_D4: split_factor
        })

end_time = time.time()  # start of generation timing
num_decode = 0

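# Decode loop: part E returns the next token id and the updated KV caches.
# 151643 (<|endoftext|>) and 151645 (<|im_end|>) are Qwen2 stop tokens, so either one ends generation.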
while (num_decode < max_single_chat_length) & (history_len < max_seq_len):
    token_id, past_key_states, past_values_states = ort_session_E.run(
        [out_name_E0, out_name_E1, out_name_E2],
        {
            in_name_E0: hidden_states,
            in_name_E1: attention_mask,
            in_name_E2: past_key_states,
            in_name_E3: past_values_states,
            in_name_E4: history_len,
            in_name_E5: ids_len,
            in_name_E6: position_ids,
            in_name_E7: pos_factor
        })

    if (token_id == 151643) | (token_id == 151645):
        break
    else:
        num_decode += 1
        if num_decode < 2:
            # After the first step the prompt is fully in the KV cache:
            # switch to single-token decoding and open up the attention mask.
            history_len += ids_len[0]
            ids_len[0] = 1
            attention_mask = np.array([0.0], dtype=np.float16)
            if use_vision:
                pos_factor = np.array(pos_factor_v + ids_len[0], dtype=np.float16)
            else:
                pos_factor = np.array(history_len[0] + 1, dtype=np.float16)
        else:
            history_len += 1
            pos_factor += 1

        # Embed the new token with part B and feed it back into the decoder.
        input_ids[0] = token_id
        hidden_states = ort_session_B.run(
            [out_name_B0],
            {
                in_name_B0: input_ids,
                in_name_B1: ids_len
            })[0]

        decoded_token = tokenizer.decode(token_id)
        print(decoded_token, end="", flush=True)

generation_time = time.time() - end_time
print(f"\n\nGenerated {num_decode} tokens in {generation_time:.2f} seconds.")
```

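The script expects two positional arguments: the path of the base model (used for its tokenizer and config) and the directory containing the five exported ONNX files, e.g. `python run_qwen2vl_onnx.py /path/to/Qwen2-VL-2B-Instruct /path/to/onnx` (the script name and paths here are only illustrative).
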
# Technical Information:
- [EXPORT.md](EXPORT.md)