update non-slim versions of models

Browse files

Files changed (7) hide show

Makefile +24 -10
infer.py +120 -0
onnx/QwenVL_A.onnx +2 -2
onnx/QwenVL_A.onnx.data +2 -2
onnx/QwenVL_B.onnx +2 -2
onnx/QwenVL_C.onnx +2 -2
onnx/QwenVL_D.onnx +2 -2

Makefile CHANGED Viewed

@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 # Configuration variables
-NATIVE_ANDROID = ../Native-LLM-for-Android
 QWEN_VL_DIR = $(NATIVE_ANDROID)/Export_ONNX/QwenVL
 ONNX_SRC_DIR = $(QWEN_VL_DIR)/onnx
 ONNX_DEST_DIR = $(QWEN_VL_DIR)/onnx-dist
@@ -60,11 +60,12 @@ export-merged-source-models-second-pass:
 		$(NATIVE_PYTHON) -c 'import onnx, os, sys; \
 			src = """'"$$item"'"""; \
 			total_size = os.path.getsize(src); \
 			total_size += os.path.getsize(src + ".data") if os.path.exists(src + ".data") else 0; \
 			needs_external = total_size > 2e9; \
 			onnx.save_model( \
 				onnx.load(src), \
-				src, \
 				save_as_external_data=needs_external, \
 				all_tensors_to_one_file=True, \
 				location=(os.path.basename(src) + ".data") if needs_external else None \
@@ -75,7 +76,7 @@ export-merged-source-models-second-pass:
 	echo "✅ Done second models"
-all-in-one: export quantize clean-large-files slim fix-gpu-buffers
 	@echo "✨ All done! ONNX models exported, slimmed, quantized and fixed"
 export: export-abcd export-e
@@ -93,9 +94,16 @@ export-e:
 slim:
 	@echo "🗜️  Slimming ONNX models..."
-	@files=$$(find $(ONNX_SRC_DIR) -name "*.onnx" -type f ! -name "QwenVL_E.onnx"); \
-	$(call progress_bar,$$files,onnxslim --verbose {} {})
-	@echo "✅ Slimming complete"
 quantize:
 	@echo "⚡ Starting quantization..."
@@ -144,7 +152,13 @@ clean-large-files:
 fix-gpu-buffers:
 	@echo "🔧 Fixing GPU buffers for E models..."
-	cd $(NATIVE_ANDROID) && \
-	files=$$(find $(ONNX_DEST_DIR) -name "QwenVL_E_*.onnx" -type f); \
-	$(call progress_bar,$$files, .venv/bin/python3 ONNX_Tools/clamp_for_gpu_buffers.py --overwrite {})
-	@echo "✅ GPU buffer fixes complete"

 SHELL := /bin/bash
 # Configuration variables
 # NATIVE_ANDROID is resolved to an absolute path so that the directories
 # derived from it below stay valid after a recipe changes directory
 # (the fix-gpu-buffers recipe runs `cd` into NATIVE_ANDROID).
+NATIVE_ANDROID = $(abspath ../Native-LLM-for-Android)
 QWEN_VL_DIR = $(NATIVE_ANDROID)/Export_ONNX/QwenVL
 ONNX_SRC_DIR = $(QWEN_VL_DIR)/onnx
 ONNX_DEST_DIR = $(QWEN_VL_DIR)/onnx-dist
 		$(NATIVE_PYTHON) -c 'import onnx, os, sys; \
 			src = """'"$$item"'"""; \
 			total_size = os.path.getsize(src); \
+			d = os.path.join(dest_dir, os.path.basename(src)); \
 			total_size += os.path.getsize(src + ".data") if os.path.exists(src + ".data") else 0; \
 			needs_external = total_size > 2e9; \
 			onnx.save_model( \
 				onnx.load(src), \
+				d, \
 				save_as_external_data=needs_external, \
 				all_tensors_to_one_file=True, \
 				location=(os.path.basename(src) + ".data") if needs_external else None \
 	echo "✅ Done second models"
+# Full pipeline: export, quantize, clean large files, fix GPU buffers, then
+# export the merged source models. `slim` is not a prerequisite of this
+# target, so the completion message does not claim the models were slimmed.
+all-in-one: export quantize clean-large-files fix-gpu-buffers export-merged-source-models
 	@echo "✨ All done! ONNX models exported, quantized and fixed"
 export: export-abcd export-e
 # Run onnxslim on every *.onnx file found under ONNX_SRC_DIR, except
 # QwenVL_E.onnx. Each file is passed as both input and output, so the model
 # is overwritten with its slimmed version. The whole recipe is one shell
 # command; it stops with exit status 1 at the first onnxslim failure.
 # The file list is split on whitespace, so paths must not contain spaces.
 # progress_bar is a macro defined outside this excerpt.
 slim:
 	@echo "🗜️  Slimming ONNX models..."
+	@files=`find $(ONNX_SRC_DIR) -name "*.onnx" -type f ! -name "QwenVL_E.onnx"`; \
+	total=`echo "$$files" | wc -w | tr -d ' '`; \
+	echo "Files found: $$total"; \
+	current=0; \
+	for item in $$files; do \
+		current=$$((current + 1)); \
+		$(call progress_bar,$$current,$$total,$$item); \
+		onnxslim --verbose "$$item" "$$item" || exit 1; \
+	done; \
+	echo "✅ Slimming complete"
 quantize:
 	@echo "⚡ Starting quantization..."
 # Run ONNX_Tools/clamp_for_gpu_buffers.py --overwrite on every
 # QwenVL_E_*.onnx file found under ONNX_DEST_DIR, stopping at the first
 # failure. The script is started from inside NATIVE_ANDROID because it and
 # the .venv interpreter are addressed relative to that directory. The `cd`
 # runs in a subshell so the loop's own working directory never changes
 # between iterations, and the directory is quoted in case it contains spaces.
 # progress_bar is a macro defined outside this excerpt.
 fix-gpu-buffers:
 	@echo "🔧 Fixing GPU buffers for E models..."
+	@files=`find $(ONNX_DEST_DIR) -name "QwenVL_E_*.onnx" -type f`; \
+	total=`echo "$$files" | wc -w | tr -d ' '`; \
+	echo "Files found: $$total"; \
+	current=0; \
+	for item in $$files; do \
+		current=$$((current + 1)); \
+		$(call progress_bar,$$current,$$total,$$item); \
+		( cd "$(NATIVE_ANDROID)" && .venv/bin/python3 ONNX_Tools/clamp_for_gpu_buffers.py --overwrite "$$item" ) || exit 1; \
+	done; \
+	echo "✅ GPU buffer fixes complete"

infer.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import os
+import sys
+import time
+import torch
+import numpy as np
+import requests
+import onnxruntime as ort
+from PIL import Image
+from io import BytesIO
+from transformers import Qwen2VLConfig, AutoTokenizer
+# Command line arguments
+model_path = sys.argv[1]
+onnx_path = sys.argv[2]
+# Initialize model config and tokenizer
+model_config = Qwen2VLConfig.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+# Model configuration
+max_length = 1024
+num_attention_heads = model_config.num_attention_heads
+num_key_value_heads = model_config.num_key_value_heads
+head_dim = model_config.hidden_size // num_attention_heads
+num_layers = model_config.num_hidden_layers
+# Setup ONNX sessions
+session_options = ort.SessionOptions()
+session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+# Model paths and sessions
+models = ['A', 'B', 'C', 'D', 'E']
+model_paths = {m: os.path.join(onnx_path, f'QwenVL_{m}_q4f16.onnx') for m in models}
+sessions = {m: ort.InferenceSession(path, sess_options=session_options) for m, path in model_paths.items()}
+# Input/output names
+inputs = {
+    'A': sessions['A'].get_inputs()[0].name,
+    'B': [sessions['B'].get_inputs()[i].name for i in range(2)],
+    'C': sessions['C'].get_inputs()[0].name,
+    'D': [inp.name for inp in sessions['D'].get_inputs()],
+    'E': [inp.name for inp in sessions['E'].get_inputs()]
+}
+outputs = {
+    'A': sessions['A'].get_outputs()[0].name,
+    'B': sessions['B'].get_outputs()[0].name,
+    'C': sessions['C'].get_outputs()[0].name,
+    'D': [out.name for out in sessions['D'].get_outputs()],
+    'E': [out.name for out in sessions['E'].get_outputs()]
+}
+# Process image
+image_url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg'
+image = Image.open(BytesIO(requests.get(image_url).content)).resize((960, 960)).convert('RGB')
+image_array = np.expand_dims(np.transpose(np.array(image).astype(np.float32), (2, 0, 1)), axis=0) / 255.
+# Prepare inputs
+prompt = "Describe this image."
+formatted_prompt = f"\n<|im_start|>user\n<|vision_start|><|vision_end|>{prompt}<|im_end|>\n<|im_start|>assistant\n"
+input_ids = tokenizer(formatted_prompt, return_tensors='pt')['input_ids']
+input_lengths = np.array([input_ids.shape[1]], dtype=np.int64)
+tokens = np.zeros(max_length, dtype=np.int32)
+tokens[:input_ids.shape[1]] = input_ids[0, :]
+position = np.zeros(1, dtype=np.int64)
+# Initialize caches
+key_cache = np.zeros((num_layers, num_key_value_heads, max_length, head_dim), dtype=np.float16)
+value_cache = key_cache.copy()
+# Process initial inputs
+hidden_states = sessions['B'].run(
+    [outputs['B']],
+    {inputs['B'][0]: tokens, inputs['B'][1]: input_lengths}
+)[0]
+batch_size = np.array(0, dtype=np.int32)
+batch_size, = sessions['C'].run([outputs['C']], {inputs['C']: batch_size})
+# Process image features
+image_features = sessions['A'].run([outputs['A']], {inputs['A']: image_array})[0]
+total_ids = 100  # 10 * 10 from original factors
+input_lengths += total_ids
+remaining_tokens = np.array(max_length - input_lengths[0] - total_ids, dtype=np.int32)
+tokens_to_stop = np.array(input_lengths[0] - 5, dtype=np.int32)
+hidden_states, batch_size = sessions['D'].run(
+    outputs['D'],
+    dict(zip(inputs['D'],
+        [hidden_states, image_features, input_lengths, tokens_to_stop, remaining_tokens]))
+)
+# Generate tokens
+start_time = time.time()
+for i in range(12):  # MAX_ITERATIONS
+    token, key_cache, value_cache = sessions['E'].run(
+        outputs['E'],
+        dict(zip(inputs['E'],
+            [hidden_states, np.array([-65504. if i==0 else 0.], dtype=np.float16),
+             key_cache, value_cache, position, input_lengths, batch_size,
+             np.array([1-total_ids+10 if i==0 else position[0]+1], dtype=np.float16)]))
+    )
+    if token in [151643, 151645]:  # End tokens
+        break
+    if i < 1:
+        position += input_lengths[0]
+        input_lengths[0] = 1
+    else:
+        position += 1
+    tokens[0] = token
+    hidden_states = sessions['B'].run(
+        [outputs['B']],
+        {inputs['B'][0]: tokens, inputs['B'][1]: input_lengths}
+    )[0]
+    print(tokenizer.decode(token), end='', flush=True)
+print(f"\nTotal time: {time.time() - start_time:.2f}s")

onnx/QwenVL_A.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7668776b6d8a7dbbd5344de5948f9e7040cce04ac4fafff9155204dd2e0ef561
-size 341395

 version https://git-lfs.github.com/spec/v1
+oid sha256:9359181d8a217fd066b6201ca88d39ceef8d84464e886fa3af3634b767807967
+size 22863481

onnx/QwenVL_A.onnx.data CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a1bdde323eb76c15f6eab14966d5b802c51a8d9559d5260ad3cf9e868ef160bf
-size 5322682368

 version https://git-lfs.github.com/spec/v1
+oid sha256:48c2e8d0ebb88762b324860ca74abd35d4848b08f84619e71acc5122a0e46c8f
+size 5322170368

onnx/QwenVL_B.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b2a741d6586465346e5c552c1d375da0b8321dd76a4d5498c0dd267ccd523b6
-size 233983352

 version https://git-lfs.github.com/spec/v1
+oid sha256:9b752394955396a0684cb491ebf802645ad6e73a29f4f2392c6bfd77759d7d86
+size 234019162

onnx/QwenVL_C.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a239bb5f47b6589f4db8d9a3b57ada13cabee3508851769d473f3bd2338da732
-size 6384

 version https://git-lfs.github.com/spec/v1
+oid sha256:09090f067d75cbfb62f90fc1f783529ede85e07006da80681fbb6f535baa29d6
+size 10335

onnx/QwenVL_D.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1d70b7429fc137486f82683d68953dd8a60d72466071fd22104bf5ff77e4460e
-size 25215

 version https://git-lfs.github.com/spec/v1
+oid sha256:f4393146a8d328f1eae43e9058f391a1ef07048d6793747dab948838fcdfd1e6
+size 26762