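// In-browser vision-language demo: Qwen2-VL (ONNX) running on WebGPU via Transformers.js.
// The user picks an image and a prompt; the model's answer is written into the page.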
import { AutoProcessor, Qwen2VLForConditionalGeneration, RawImage } from "@huggingface/transformers";
const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";
const exampleButton = document.getElementById('example');
const promptInput = document.querySelector('input[type="text"]');
const status = document.getElementById('status');
const thumb = document.getElementById('thumb');
const uploadInput = document.getElementById('upload');
const form = document.getElementById('form');
const output = document.getElementById('llm-output');
const dtypeSelect = document.getElementById('dtype-select');
const loadModelButton = document.getElementById('load-model');
const container = document.getElementById('container');
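// Demo state: the currently selected image (URL or data URL) and prompt text.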
let currentImage = '';
let currentQuery = '';
const model_id = "onnx-community/Qwen2-VL-2B-Instruct";
let processor;
let model;
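// Download the processor and model once, on demand. The dtype dropdown lets the
// user pick the weight dtype before loading; inference runs on WebGPU.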
async function initializeSessions() {
  loadModelButton.textContent = 'Loading Model...';
  loadModelButton.classList.add('loading');
  container.classList.add('disabled');

  processor = await AutoProcessor.from_pretrained(model_id);

  const dtype = dtypeSelect.value;
  const options = { device: 'webgpu' };
  if (dtype) {
    options.dtype = dtype;
  }
  // Keep the KV cache in fp16 to cut GPU memory use during generation.
  options['transformers.js_config'] = { kv_cache_dtype: 'float16' };
  model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, options);

  loadModelButton.textContent = 'Model Ready';
  loadModelButton.classList.remove('loading');
  loadModelButton.classList.add('ready');
  dtypeSelect.disabled = true;
  uploadInput.disabled = false;
  promptInput.disabled = false;
  container.classList.remove('disabled');
}
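// Run one query end-to-end, reflecting progress and errors in the button label.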
async function handleQuery(imageUrl, query) {
  try {
    loadModelButton.textContent = 'Processing...';
    await imageTextToText(imageUrl, query, (out) => {
      console.log({ out });
      output.textContent = out;
    });
    loadModelButton.textContent = 'Model Ready';
  } catch (err) {
    loadModelButton.textContent = 'Error';
    console.error(err);
  } finally {
    // Re-enable the inputs (disabled on submit) so a follow-up query is possible.
    promptInput.disabled = false;
    uploadInput.disabled = false;
  }
}
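// Core pipeline: load and resize the image, build a chat-formatted prompt,
// generate up to 128 new tokens, and decode only the model's reply.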
async function imageTextToText(imagePath, query, cb) {
  // Resize to a fixed 448x448 to bound the number of vision tokens.
  const image = await (await RawImage.read(imagePath)).resize(448, 448);

  const conversation = [
    {
      role: "user",
      content: [
        { type: "image" },
        { type: "text", text: query },
      ],
      images: [image],
    },
  ];

  // Render the chat template, then tokenize the prompt and preprocess the image together.
  const text = processor.apply_chat_template(conversation, { add_generation_prompt: true });
  const inputs = await processor(text, image);

  const outputs = await model.generate({
    ...inputs,
    max_new_tokens: 128,
  });

  // Slice off the prompt tokens so only the newly generated answer is decoded.
  const decoded = processor.batch_decode(
    outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
    { skip_special_tokens: true },
  );
  cb(decoded);
  return decoded;
}
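// Show the chosen image as a thumbnail, scaled to fit a 320px box.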
async function updatePreview(url) {
  const image = await RawImage.fromURL(url);
  const ar = image.width / image.height;
  const [cw, ch] = (ar > 1) ? [320, 320 / ar] : [320 * ar, 320];
  thumb.style.width = `${cw}px`;
  thumb.style.height = `${ch}px`;
  thumb.style.backgroundImage = `url(${url})`;
  thumb.innerHTML = '';
}
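// Model loading is deferred until the user explicitly asks for it.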
loadModelButton.addEventListener('click', async () => {
  dtypeSelect.disabled = true;
  loadModelButton.disabled = true;
  await initializeSessions();
});
// UI Event Handlers
exampleButton.addEventListener('click', (e) => {
  e.preventDefault();
  currentImage = EXAMPLE_URL;
  updatePreview(currentImage);
});
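// Uploaded files are read as data URLs so RawImage.read() can consume them directly.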
uploadInput.addEventListener('change', (e) => {
  const file = e.target.files[0];
  if (!file) return;
  const reader = new FileReader();
  reader.onload = (e2) => {
    currentImage = e2.target.result;
    updatePreview(currentImage);
  };
  reader.readAsDataURL(file);
});
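// Track the prompt as the user types. The 'input' event (unlike 'keypress')
// fires after the value updates and also covers pastes and deletions.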
promptInput.addEventListener('input', (e) => {
  currentQuery = e.target.value;
});
form.addEventListener('submit', (e) => {
  e.preventDefault();
  if (!currentImage || !currentQuery) {
    loadModelButton.textContent = 'Please select an image and type a prompt';
    setTimeout(() => {
      loadModelButton.textContent = 'Model Ready';
    }, 2000);
  } else {
    promptInput.disabled = true;
    uploadInput.disabled = true;
    handleQuery(currentImage, currentQuery);
  }
});