import { env, AutoTokenizer, RawImage, Tensor } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2";
import { getModelJSON } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/src/utils/hub.js";
import * as ort from "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.20.0/dist/ort.webgpu.mjs";

// Since we will download the model from the Hugging Face Hub, we can skip the local model check
env.allowLocalModels = false;

// Reference the elements that we will need
const status = document.getElementById('status');
const fileUpload = document.getElementById('upload');
const imageContainer = document.getElementById('container');
const example = document.getElementById('example');

const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";

// Vision settings: the encoder works on 960x960 inputs and produces a
// WIDTH_FACTOR x HEIGHT_FACTOR = 100 grid of image tokens
const INPUT_IMAGE_SIZE = [960, 960];
const HEIGHT_FACTOR = 10;
const WIDTH_FACTOR = 10;
const IMAGE_EMBED_SIZE = WIDTH_FACTOR * HEIGHT_FACTOR;
const MAX_SEQ_LENGTH = 1024;

// Location of the exported ONNX model parts (served locally here)
const BASE_URL = "http://localhost:3004/onnx";
const BASE_MODEL = "Qwen/Qwen2-VL-2B-Instruct";
const QUANTIZATION = "q4f16";
const MAX_SINGLE_CHAT_LENGTH = 10;

status.textContent = 'Ready';

example.addEventListener('click', (e) => {
    e.preventDefault();
    detect(EXAMPLE_URL);
});

fileUpload.addEventListener('change', function (e) {
    const file = e.target.files[0];
    if (!file) {
        return;
    }

    const reader = new FileReader();

    // Set up a callback when the file is loaded
    reader.onload = e2 => detect(e2.target.result);

    reader.readAsDataURL(file);
});

// Run the model on the image and display the generated answer
async function detect(img) {
    imageContainer.innerHTML = '';
    imageContainer.style.backgroundImage = `url(${img})`;

    status.textContent = 'Analysing...';
    const output = await imageTextToText(img, 'Describe this image.');
    status.textContent = output;
}

export async function imageTextToText(
    imagePath,
    query,
    vision = true
) {
    const suffix = QUANTIZATION ? `_${QUANTIZATION}` : "";
    // Model configuration (hidden size, number of layers/heads, ...) from the Hub
    const config = await getModelJSON(BASE_MODEL, "config.json");

    // Number of prompt tokens that precede the image placeholder
    const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);

    let position_ids;
    let num_decode = 0;
    let history_len = new Tensor("int64", new BigInt64Array([0n]), [1]);

    // Empty key/value cache for the decoder
    let past_key_states = new ort.Tensor(
        "float16",
        new Uint16Array(
            config.num_hidden_layers *
            config.num_key_value_heads *
            MAX_SEQ_LENGTH *
            (config.hidden_size / config.num_attention_heads)
        ).fill(0),
        [
            config.num_hidden_layers,
            config.num_key_value_heads,
            MAX_SEQ_LENGTH,
            config.hidden_size / config.num_attention_heads,
        ]
    );
    let past_value_states = past_key_states;

    let attention_mask = new ort.Tensor(
        "float16",
        new Uint16Array([0xfbff]), // -65504.0 in float16
        [1]
    );
    let pos_factor = new Tensor("float16", new Uint16Array([0]), [1]);

    // Tokenize the chat-formatted prompt
    const tokenizer = await AutoTokenizer.from_pretrained(BASE_MODEL);
    const prompt = `\n<|im_start|>user\n<|vision_start|><|vision_end|>${query}<|im_end|>\n<|im_start|>assistant\n`;
    const token = (await tokenizer(prompt, {
        return_tensors: "pt",
        add_generation_prompt: false,
        tokenize: true,
    })).input_ids;

    const seq_length = token.dims[1];
    let ids_len = new Tensor("int64", new BigInt64Array([BigInt(seq_length)]), [
        1,
    ]);

    let input_ids = new ort.Tensor(
        "int32",
        new Int32Array(MAX_SEQ_LENGTH).fill(0),
        [MAX_SEQ_LENGTH]
    );
    input_ids.data.set(Array.from(token.data.slice(0, seq_length), Number));

    // Part B: embed the prompt tokens to get the initial hidden states
    const ortSessionB = await ort.InferenceSession.create(
        `${BASE_URL}/QwenVL_B${suffix}.onnx`,
        { executionProviders: ["webgpu"] }
    );
    let { hidden_states } = await ortSessionB.run({ input_ids, ids_len });

    if (vision) {
        // Load the image and normalise it to a [1, 3, H, W] float tensor in [0, 1]
        let image = await RawImage.fromURL(imagePath);
        image = image.rgb().toTensor("CHW").to("float32").div_(255.0);
        const pixel_values = image.unsqueeze(0);

        // Part A: vision encoder producing the image embeddings
        const ortSessionA = await ort.InferenceSession.create(
            `${BASE_URL}/QwenVL_A${suffix}.onnx`,
            { executionProviders: ["webgpu"] }
        );
        const { image_embed } = await ortSessionA.run({ pixel_values });

        // Account for the 100 image tokens inserted into the sequence
        ids_len = ids_len.add(BigInt(IMAGE_EMBED_SIZE));

        // Part D: merge the image embeddings into the text hidden states and
        // compute the position ids for the combined sequence
        const ortSessionD = await ort.InferenceSession.create(
            `${BASE_URL}/QwenVL_D${suffix}.onnx`,
            { executionProviders: ["webgpu"] }
        );

        ({ hidden_states, position_ids } = await ortSessionD.run({
            "hidden_states.1": hidden_states,
            image_embed,
            ids_len,
            "ids_len_minus": new Tensor(
                "int32",
                new Int32Array([Number(ids_len.item()) - Number(prompt_head_len.item())]),
                [1]
            ),
            "split_factor": new Tensor(
                "int32",
                new Int32Array([
                    MAX_SEQ_LENGTH - Number(ids_len.item()) - IMAGE_EMBED_SIZE,
                ]),
                [1]
            ),
        }));
    }

    // Part E: the decoder with key/value cache, run once per generated token
    const ortSessionE = await ort.InferenceSession.create(
        `${BASE_URL}/QwenVL_E${suffix}.onnx`,
        { executionProviders: ["wasm"] }
    );

    const output_ids = [];
    while (
        num_decode < MAX_SINGLE_CHAT_LENGTH &&
        Number(history_len.data[0]) < MAX_SEQ_LENGTH
    ) {
        const result = await ortSessionE.run({
            hidden_states,
            attention_mask,
            "past_key_states.1": past_key_states,
            "past_value_states.1": past_value_states,
            history_len,
            ids_len,
            position_ids,
            pos_factor,
        });

        // Stop on the end-of-text (151643) or end-of-turn (151645) token
        const token_id = Number(result.max_logit_ids.data[0]);
        if (token_id === 151643 || token_id === 151645) break;

        output_ids.push(token_id);
        num_decode++;
        history_len = history_len.add(BigInt(1));
        pos_factor = new Tensor(
            "float16",
            new Uint16Array([Number(pos_factor.data[0]) + 1]),
            [1]
        );
        past_key_states = result.past_key_states;
        past_value_states = result.past_value_states;

        // Part B again: embed the newly generated token for the next decode step
        input_ids.data[0] = token_id;
        ({ hidden_states } = await ortSessionB.run({ input_ids, ids_len }));
    }

    // Decode the generated token ids into text
    return tokenizer.decode(output_ids, { skip_special_tokens: true });
}
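
// For reference: the float16 tensors above are created from raw IEEE 754
// half-precision bit patterns (e.g. 0xfbff for -65504.0). Below is a minimal
// sketch of how such a bit pattern can be derived from an ordinary JS number,
// in case other scalar float16 values need to be encoded. toFloat16Bits is a
// hypothetical helper written here for illustration only; it is not part of
// transformers.js or onnxruntime-web.
function toFloat16Bits(value) {
    const f32 = new Float32Array(1);
    const u32 = new Uint32Array(f32.buffer);
    f32[0] = value;
    const x = u32[0];
    const sign = (x >>> 16) & 0x8000;
    const exp = (x >>> 23) & 0xff;
    let mant = x & 0x7fffff;
    if (exp === 0xff) return sign | 0x7c00 | (mant ? 0x200 : 0); // Inf / NaN
    const e = exp - 127 + 15;                // re-bias the exponent for half precision
    if (e >= 0x1f) return sign | 0x7c00;     // overflow: +/- Infinity
    if (e <= 0) {                            // subnormal or zero
        if (e < -10) return sign;            // underflow: +/- 0
        mant = (mant | 0x800000) >> (1 - e);
        return sign | (mant >> 13);
    }
    return sign | (e << 10) | (mant >> 13);  // normal number (mantissa truncated)
}
// Example: toFloat16Bits(-65504) === 0xfbff, the attention_mask fill value used above.

// Usage sketch (assumes the QwenVL_A/B/D/E ONNX parts are reachable under
// BASE_URL and the browser supports WebGPU); the query string is only an example:
//
//   const answer = await imageTextToText(EXAMPLE_URL, "How many cars are in the picture?");
//   console.log(answer);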