pdufour committed on
Commit
b2ca260
1 Parent(s): 4c4928f

Update index.js

Browse files
Files changed (1) hide show
  1. index.js +36 -2
index.js CHANGED
@@ -2,6 +2,7 @@ import { env, AutoTokenizer, RawImage, Tensor } from 'https://cdn.jsdelivr.net/n
2
  import { getModelJSON, getModelFile } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/src/utils/hub.js";
3
  import * as ort from "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.20.0/dist/ort.webgpu.mjs";
4
 
 
5
  const INPUT_IMAGE_SIZE = [960, 960];
6
  const HEIGHT_FACTOR = 10;
7
  const WIDTH_FACTOR = 10;
@@ -12,9 +13,17 @@ const ONNX_MODEL = "pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16";
12
  const QUANT = "q4f16";
13
  const MAX_SINGLE_CHAT_LENGTH = 10;
14
 
 
 
 
 
 
 
15
  let ortSessionA, ortSessionB, ortSessionC;
16
 
17
  async function initializeSessions() {
 
 
18
  ortSessionA = await ort.InferenceSession.create(
19
  await getModelFile(ONNX_MODEL, `onnx/QwenVL_A_${QUANT}.onnx`),
20
  { executionProviders: ["webgpu"] }
@@ -29,9 +38,35 @@ async function initializeSessions() {
29
  await getModelFile(ONNX_MODEL, `onnx/QwenVL_C_${QUANT}.onnx`),
30
  { executionProviders: ["webgpu"] }
31
  );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  }
33
 
34
- export async function imageTextToText(imagePath, query, vision = true) {
35
  const config = await getModelJSON(BASE_MODEL, "config.json");
36
 
37
  const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);
@@ -72,7 +107,6 @@ export async function imageTextToText(imagePath, query, vision = true) {
72
  );
73
  input_ids.data.set(Array.from(token.data.slice(0, token.dims[1]), Number));
74
 
75
- // Get position IDs from Session C
76
  const dummy = new ort.Tensor("int32", new Int32Array([0]), []);
77
  let { position_ids } = await ortSessionC.run({ dummy });
78
 
 
2
  import { getModelJSON, getModelFile } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/src/utils/hub.js";
3
  import * as ort from "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.20.0/dist/ort.webgpu.mjs";
4
 
5
+ const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";
6
  const INPUT_IMAGE_SIZE = [960, 960];
7
  const HEIGHT_FACTOR = 10;
8
  const WIDTH_FACTOR = 10;
 
13
  const QUANT = "q4f16";
14
  const MAX_SINGLE_CHAT_LENGTH = 10;
15
 
16
+ // UI Elements
17
+ const status = document.getElementById('status');
18
+ const fileUpload = document.getElementById('upload');
19
+ const imageContainer = document.getElementById('container');
20
+ const example = document.getElementById('example');
21
+
22
  let ortSessionA, ortSessionB, ortSessionC;
23
 
24
  async function initializeSessions() {
25
+ status.textContent = 'Loading model...';
26
+
27
  ortSessionA = await ort.InferenceSession.create(
28
  await getModelFile(ONNX_MODEL, `onnx/QwenVL_A_${QUANT}.onnx`),
29
  { executionProviders: ["webgpu"] }
 
38
  await getModelFile(ONNX_MODEL, `onnx/QwenVL_C_${QUANT}.onnx`),
39
  { executionProviders: ["webgpu"] }
40
  );
41
+
42
+ status.textContent = 'Ready';
43
+ }
44
+
45
// UI Event Handlers
// Clicking the example link runs the bundled demo image through the pipeline.
const onExampleClick = (event) => {
  event.preventDefault();
  parse(EXAMPLE_URL, 'Describe this image.');
};
example.addEventListener('click', onExampleClick);
50
+
51
// Read a user-selected image file as a data URL and run it through the
// pipeline with an empty prompt.
// Fix: the original attached no `onerror` handler to the FileReader, so a
// failed read gave the user no feedback at all; errors are now surfaced
// via the status element.
fileUpload.addEventListener('change', (e) => {
  const file = e.target.files[0];
  if (!file) return;

  const reader = new FileReader();
  reader.onload = (loadEvent) => parse(loadEvent.target.result, '');
  reader.onerror = () => {
    status.textContent = 'Failed to read the selected file.';
  };
  reader.readAsDataURL(file);
});
59
+
60
/**
 * Show the image in the container, run the vision-language model on it,
 * and display the generated text.
 *
 * Fix: the original had no error handling around `imageTextToText`, so any
 * inference failure left the status stuck at 'Analysing...' and produced an
 * unhandled promise rejection. Failures are now caught and reported.
 *
 * @param {string} img - Image source (URL or data URL) to analyse.
 * @param {string} txt - User prompt passed to the model (may be empty).
 */
async function parse(img, txt) {
  imageContainer.innerHTML = '';
  imageContainer.style.backgroundImage = `url(${img})`;
  status.textContent = 'Analysing...';
  try {
    const output = await imageTextToText(img, txt);
    status.textContent = '';
    imageContainer.textContent = output;
  } catch (err) {
    status.textContent = 'Analysis failed — see console for details.';
    console.error(err);
  }
}
68
 
69
+ async function imageTextToText(imagePath, query, vision = true) {
70
  const config = await getModelJSON(BASE_MODEL, "config.json");
71
 
72
  const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);
 
107
  );
108
  input_ids.data.set(Array.from(token.data.slice(0, token.dims[1]), Number));
109
 
 
110
  const dummy = new ort.Tensor("int32", new Int32Array([0]), []);
111
  let { position_ids } = await ortSessionC.run({ dummy });
112