import { env, AutoTokenizer, RawImage, Tensor } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
import { getModelJSON, getModelFile } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/src/utils/hub.js";
import * as ort from "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.20.0/dist/ort.webgpu.mjs";
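
// Browser demo: Qwen2-VL-2B image-text-to-text inference, split across five
// ONNX sub-graphs (A: vision encoder, B: token embedding, C: initial position
// ids, D: merges image embeddings into the text sequence, E: decoder with a
// KV cache). Sessions A-D run on WebGPU; the decoder (E) runs on wasm.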

const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";
const INPUT_IMAGE_SIZE = [960, 960];
const HEIGHT_FACTOR = 10;
const WIDTH_FACTOR = 10;
const IMAGE_EMBED_SIZE = WIDTH_FACTOR * HEIGHT_FACTOR;
const MAX_SEQ_LENGTH = 1024;
const BASE_MODEL = "Qwen/Qwen2-VL-2B-Instruct";
const ONNX_MODEL = "pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16";
const QUANT = "q4f16";
const MAX_SINGLE_CHAT_LENGTH = 10;
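
// The vision encoder reduces the 960x960 input to a WIDTH_FACTOR x
// HEIGHT_FACTOR grid, so IMAGE_EMBED_SIZE = 100 image-embedding tokens get
// spliced into the text sequence between <|vision_start|> and <|vision_end|>.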

// UI Elements
const exampleButton = document.getElementById('example');
const promptInput = document.querySelector('input[type="text"]');
const status = document.getElementById('status');
const imageContainer = document.getElementById('container');
const thumb = document.getElementById('thumb');
const uploadInput = document.getElementById('upload');
const form = document.getElementById('form');

let ortSessionA, ortSessionB, ortSessionC, ortSessionD, ortSessionE;
let config;
let currentImage = '';
let currentQuery = '';

async function initializeSessions() {
  status.textContent = 'Loading model...';
  
  ortSessionA = await ort.InferenceSession.create(
    await getModelFile(ONNX_MODEL, `onnx/QwenVL_A_${QUANT}.onnx`),
    { executionProviders: ["webgpu"] }
  );

  ortSessionB = await ort.InferenceSession.create(
    await getModelFile(ONNX_MODEL, `onnx/QwenVL_B_${QUANT}.onnx`),
    { executionProviders: ["webgpu"] }
  );

  ortSessionC = await ort.InferenceSession.create(
    await getModelFile(ONNX_MODEL, `onnx/QwenVL_C_${QUANT}.onnx`),
    { executionProviders: ["webgpu"] }
  );

  config = await getModelJSON(BASE_MODEL, "config.json");

  status.textContent = 'Ready';
}
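
// Helpers for moving scalar values in and out of float16 tensors. float16 is
// handled as raw uint16 bit patterns (1 sign bit, 5 exponent bits with bias
// 15, 10 mantissa bits), matching the Uint16Array buffers that back ort
// "float16" tensors.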

export function int64ToFloat16(int64Value) {
  // Convert BigInt to Number (float64)
  const float64Value = Number(int64Value);

  // Handle special cases
  if (!isFinite(float64Value)) return float64Value > 0 ? 0x7c00 : 0xfc00; // +/- infinity
  if (float64Value === 0) return 0; // Zero is represented as 0

  // Get sign, exponent, and mantissa from float64
  const sign = float64Value < 0 ? 1 : 0;
  const absValue = Math.abs(float64Value);
  const exponent = Math.floor(Math.log2(absValue));
  const mantissa = absValue / Math.pow(2, exponent) - 1;

  // Convert exponent and mantissa to float16 format
  const float16Exponent = exponent + 15; // Offset exponent by 15 (float16 bias)
  const float16Mantissa = Math.round(mantissa * 1024); // 10-bit mantissa for float16

  // Handle overflow/underflow
  if (float16Exponent <= 0) {
    // Subnormal numbers (exponent <= 0)
    return (sign << 15) | (float16Mantissa >> 1);
  } else if (float16Exponent >= 31) {
    // Overflow, set to infinity
    return (sign << 15) | 0x7c00;
  } else {
    // Normalized numbers
    return (sign << 15) | (float16Exponent << 10) | (float16Mantissa & 0x3ff);
  }
}
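
// Sanity check (illustrative, not executed): int64ToFloat16(1n) === 0x3c00,
// the float16 bit pattern for 1.0, and float16ToInt64(0x3c00) === 1n.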

export function float16ToInt64(float16Value) {
  // Extract components from float16
  const sign = (float16Value & 0x8000) >> 15;
  const exponent = (float16Value & 0x7c00) >> 10;
  const mantissa = float16Value & 0x03ff;

  // Handle special cases
  if (exponent === 0 && mantissa === 0) return BigInt(0); // Zero
  if (exponent === 0x1f) throw new Error("float16 Infinity/NaN cannot be represented as BigInt"); // BigInt("Infinity") would throw a SyntaxError

  // Convert back to number
  let value;
  if (exponent === 0) {
    // Subnormal numbers
    value = Math.pow(2, -14) * (mantissa / 1024);
  } else {
    // Normalized numbers
    value = Math.pow(2, exponent - 15) * (1 + mantissa / 1024);
  }

  // Apply sign
  value = sign ? -value : value;

  return BigInt(Math.round(value));
}

async function handleQuery(imageUrl, query) {
  if (!imageUrl || !query.trim()) {
    status.textContent = 'Please provide both an image and a prompt';
    return;
  }
  
  try {
    status.textContent = 'Analyzing...';

    updatePreview(imageUrl);
    
    const result = await imageTextToText(imageUrl, query);
    status.textContent = result;
  } catch (err) {
    status.textContent = 'Error processing request';
    console.error(err);
  }
}

export async function imageTextToText(
  imagePath,
  query,
  vision = true
) {

  const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);

  let position_ids;
  let num_decode = 0;
  let history_len = new Tensor("int64", new BigInt64Array([0n]), [1]);

  const pos_factor_v = BigInt(1 - IMAGE_EMBED_SIZE + WIDTH_FACTOR);
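  // pos_factor_v = 1 - IMAGE_EMBED_SIZE + WIDTH_FACTOR = -89 here; it appears
  // to compensate for the 2-D rotary layout of the image-token grid when the
  // position factor is rebased at the first decode step (see below).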

  let past_key_states = new ort.Tensor(
    "float16",
    new Uint16Array(
      config.num_hidden_layers *
        config.num_key_value_heads *
        MAX_SEQ_LENGTH *
        (config.hidden_size / config.num_attention_heads)
    ).fill(0),
    [
      config.num_hidden_layers,
      config.num_key_value_heads,
      MAX_SEQ_LENGTH,
      config.hidden_size / config.num_attention_heads,
    ]
  );

  let past_value_states = past_key_states;
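  // Both caches start as one zero-filled float16 tensor of shape
  // [num_layers, num_kv_heads, MAX_SEQ_LENGTH, head_dim]; session E returns
  // fresh tensors each step, so this initial aliasing is harmless.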

  let attention_mask = new ort.Tensor(
    "float16",
    new Uint16Array([0xfbff]),
    [1]
  );
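  // 0xfbff is the float16 bit pattern for -65504 (the most negative finite
  // float16), serving as an additive ~ -infinity attention mask during prefill.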

  let pos_factor = new Tensor("float16", new Uint16Array([0]), [1]);
  
  const tokenizer = await AutoTokenizer.from_pretrained(BASE_MODEL);
  const prompt = `\n<|im_start|>user\n<|vision_start|><|vision_end|>${query}<|im_end|>\n<|im_start|>assistant\n`;
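  // The <|vision_start|><|vision_end|> span is left empty in the text prompt;
  // session D later splices the image-embedding vectors in at that position.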
  const token = await tokenizer(prompt, {
    return_tensors: "pt",
    add_generation_prompt: false,
    tokenize: true,
  }).input_ids;

  const seq_length = token.dims[1];
  let ids_len = new Tensor("int64", new BigInt64Array([BigInt(seq_length)]), [
    1,
  ]);

  let input_ids = new ort.Tensor(
    "int32",
    new Int32Array(MAX_SEQ_LENGTH).fill(0),
    [MAX_SEQ_LENGTH]
  );

  input_ids.data.set(Array.from(token.data.slice(0, seq_length), Number));

  const dummy = new ort.Tensor("int32", new Int32Array([0]), []);
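  // Session C takes only this placeholder input and emits the initial
  // position ids for the prompt.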

  let { hidden_states } = await ortSessionB.run({
    input_ids: input_ids,
    ids_len: ids_len,
  });

  ({ position_ids } = await ortSessionC.run({
    dummy: dummy,
  }));

  // Process image
  if (vision) {
    let image = await RawImage.fromURL(imagePath);

    image = await image.resize(INPUT_IMAGE_SIZE[0], INPUT_IMAGE_SIZE[1]);

    image = image.rgb();

    image = image.toTensor("CHW");
    image = image.to("float32");
    image = image.div_(255.0);
    const pixel_values = image.unsqueeze(0);
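    // pixel_values: [1, 3, 960, 960], float32 scaled to [0, 1].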

    const { image_embed } = await ortSessionA.run({
      pixel_values: pixel_values,
    });

    ids_len = ids_len.add(BigInt(IMAGE_EMBED_SIZE));

    const split_factor = new Tensor(
      "int32",
      new Int32Array([
        MAX_SEQ_LENGTH - Number(ids_len.item()) - IMAGE_EMBED_SIZE,
      ]),
      [1]
    );

    const ids_len_minus = new Tensor(
      "int32",
      new Int32Array([Number(ids_len.item()) - Number(prompt_head_len.item())]),
      [1]
    );

    await ortSessionA.release();
    ortSessionA = null;
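    // Release the vision encoder before loading session D, presumably to keep
    // peak GPU memory down; both are needed only once per image.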

    ortSessionD = await ort.InferenceSession.create(
      await getModelFile(ONNX_MODEL, `onnx/QwenVL_D_${QUANT}.onnx`),
      {
        executionProviders: ["webgpu"],
      }
    );

    ({ hidden_states, position_ids } = await ortSessionD.run({
      "hidden_states.1": hidden_states,
      image_embed,
      ids_len,
      ids_len_minus,
      split_factor,
    }));

    await ortSessionD.release();
    ortSessionD = null;
  }

  let output = '';

  while (
    num_decode < MAX_SINGLE_CHAT_LENGTH &&
    Number(history_len.data[0]) < MAX_SEQ_LENGTH
  ) {
    let token_id;

    if (!ortSessionE) {
      ortSessionE = await ort.InferenceSession.create(
        await getModelFile(ONNX_MODEL, `onnx/QwenVL_E_${QUANT}.onnx`),
        {
          executionProviders: ["wasm"],
        },
      );
    }

    ({
      max_logit_ids: token_id,
      past_key_states: past_key_states,
      past_value_states: past_value_states,
    } = await ortSessionE.run({
      hidden_states,
      attention_mask,
      "past_key_states.1": past_key_states,
      "past_value_states.1": past_value_states,
      history_len,
      ids_len,
      position_ids,
      pos_factor,
    }));
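    // 151643 = <|endoftext|>, 151645 = <|im_end|> -- Qwen2's stop tokens.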

    if (Number(token_id.data[0]) === 151643 || Number(token_id.data[0]) === 151645) {
      break;
    }

    num_decode++;
    if (num_decode < 2) {
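      // First decode step: transition from prefill to incremental decoding.
      // The prompt length is absorbed into history_len, ids_len drops to 1,
      // and the additive attention mask is cleared to 0.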
      history_len = history_len.add(BigInt(ids_len.data[0]));

      ids_len = new ort.Tensor("int64", new BigInt64Array([1n]), [1]);

      attention_mask = new ort.Tensor("float16", new Uint16Array([0]), [1]);

      if (vision) {
        pos_factor = new Tensor(
          "float16",
          new Uint16Array([int64ToFloat16(pos_factor_v + ids_len.data[0])]),
          [1]
        );
      } else {
        pos_factor = new Tensor(
          "float16",
          new Uint16Array([int64ToFloat16(history_len.data[0] + BigInt(1))]),
          [1]
        );
      }

    } else {
      history_len = history_len.add(BigInt(1));
      pos_factor = pos_factor.map((v) =>
        int64ToFloat16(float16ToInt64(v) + BigInt(1))
      );
    }
    input_ids.data[0] = Number(token_id.data[0]);

    const result_B = await ortSessionB.run({
      input_ids: input_ids,
      ids_len: ids_len,
    });
    hidden_states = result_B.hidden_states;

    // The original check could never throw for a non-integer number; validate
    // that the token id is an integer (bigint or integral number) instead.
    const tok = token_id.data[0];
    if (typeof tok !== "bigint" && !Number.isInteger(tok)) {
      throw new Error(`Token ID is not an integer`);
    }

    const decoded = tokenizer.decode([...token_id.data]);
    output += decoded;
  }

  return output;
}

async function updatePreview(url) {
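  // Fit the preview thumbnail into a 640px box, preserving aspect ratio.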
  const image = await RawImage.fromURL(url);
  const ar = image.width / image.height;
  const [cw, ch] = (ar > 1) ? [640, 640 / ar] : [640 * ar, 640];
  thumb.style.width = `${cw}px`;
  thumb.style.height = `${ch}px`;
  thumb.style.backgroundImage = `url(${url})`;
}

await initializeSessions();

// UI Event Handlers
exampleButton.addEventListener('click', (e) => {
  e.preventDefault();
  e.stopPropagation();
  currentImage = EXAMPLE_URL;
  updatePreview(currentImage);
});

uploadInput.addEventListener('change', (e) => {
  const file = e.target.files[0];
  if (!file) return;

  const reader = new FileReader();
  reader.onload = (e2) => {
    currentImage = e2.target.result;
    updatePreview(currentImage);
  };
  reader.readAsDataURL(file);
});


form.addEventListener('submit', (e) => {
  e.preventDefault();
  e.stopPropagation();
  
  if (!currentImage || !currentQuery) {
    status.textContent = 'Please select an image and type a prompt';
  } else {
    handleQuery(currentImage, currentQuery);
  }
});