import {
  AutoProcessor,
  Qwen2VLForConditionalGeneration,
  RawImage,
} from "@huggingface/transformers";

const EXAMPLE_URL =
  "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";

// DOM references
const exampleButton = document.getElementById("example");
const promptInput = document.querySelector('input[type="text"]');
const status = document.getElementById("status");
const thumb = document.getElementById("thumb");
const uploadInput = document.getElementById("upload");
const form = document.getElementById("form");
const output = document.getElementById("llm-output");
const dtypeSelect = document.getElementById("dtype-select");
const loadModelButton = document.getElementById("load-model");
const container = document.getElementById("container");

let currentImage = "";
let currentQuery = "";

const model_id = "onnx-community/Qwen2-VL-2B-Instruct";
let processor;
let model;

// Load the processor and model, reporting progress through the button label.
async function initializeSessions() {
  loadModelButton.textContent = "Loading Model...";
  loadModelButton.classList.add("loading");
  container.classList.add("disabled");

  processor = await AutoProcessor.from_pretrained(model_id);

  const dtype = dtypeSelect.value;
  const options = { device: "webgpu" };
  if (dtype) {
    options.dtype = dtype;
  }
  // Store the KV cache in float16 to cut GPU memory use during generation.
  options["transformers.js_config"] = { kv_cache_dtype: "float16" };

  model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, options);

  loadModelButton.textContent = "Model Ready";
  loadModelButton.classList.remove("loading");
  loadModelButton.classList.add("ready");
  dtypeSelect.disabled = true;
  uploadInput.disabled = false;
  promptInput.disabled = false;
  container.classList.remove("disabled");
}

// Run one image+text query and write the answer into the output element.
async function handleQuery(imageUrl, query) {
  try {
    loadModelButton.textContent = "Processing...";
    await imageTextToText(imageUrl, query, (out) => {
      console.log({ out });
      output.textContent = out;
    });
    loadModelButton.textContent = "Model Ready";
  } catch (err) {
    loadModelButton.textContent = "Error";
    console.error(err);
  } finally {
    // Re-enable the inputs (disabled on submit) so another question can be asked.
    promptInput.disabled = false;
    uploadInput.disabled = false;
  }
}

async function imageTextToText(imagePath, query, cb) {
  // Resize to a fixed 448x448 input to bound the number of vision tokens.
  const image = await (await RawImage.read(imagePath)).resize(448, 448);

  // Build a chat-style prompt; the { type: "image" } entry marks where the
  // chat template inserts the vision tokens. The image itself is passed to
  // the processor below.
  const conversation = [
    {
      role: "user",
      content: [
        { type: "image" },
        { type: "text", text: query },
      ],
    },
  ];
  const text = processor.apply_chat_template(conversation, {
    add_generation_prompt: true,
  });
  const inputs = await processor(text, image);

  const outputs = await model.generate({
    ...inputs,
    max_new_tokens: 128,
  });

  // Decode only the newly generated tokens, skipping the prompt portion.
  const decoded = processor.batch_decode(
    outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
    { skip_special_tokens: true },
  );
  cb(decoded[0]); // single-image batch, so take the first sequence
  return decoded[0];
}

// Scale the thumbnail to fit a 320px box while preserving aspect ratio.
async function updatePreview(url) {
  const image = await RawImage.fromURL(url);
  const ar = image.width / image.height;
  const [cw, ch] = ar > 1 ? [320, 320 / ar] : [320 * ar, 320];
  thumb.style.width = `${cw}px`;
  thumb.style.height = `${ch}px`;
  thumb.style.backgroundImage = `url(${url})`;
  thumb.innerHTML = "";
}

loadModelButton.addEventListener("click", async () => {
  dtypeSelect.disabled = true;
  loadModelButton.disabled = true;
  await initializeSessions();
});

// UI event handlers
exampleButton.addEventListener("click", (e) => {
  e.preventDefault();
  currentImage = EXAMPLE_URL;
  updatePreview(currentImage);
});

uploadInput.addEventListener("change", (e) => {
  const file = e.target.files[0];
  if (!file) return;
  const reader = new FileReader();
  reader.onload = (e2) => {
    currentImage = e2.target.result;
    updatePreview(currentImage);
  };
  reader.readAsDataURL(file);
});

// Use "input" rather than "keypress": "input" fires after the field value
// updates (and on paste/backspace), so currentQuery never lags the contents.
promptInput.addEventListener("input", (e) => {
  currentQuery = e.target.value;
});

form.addEventListener("submit", (e) => {
  e.preventDefault();
  if (!currentImage || !currentQuery) {
    loadModelButton.textContent = "Please select an image and type a prompt";
    setTimeout(() => {
      loadModelButton.textContent = "Model Ready";
    }, 2000);
  } else {
    promptInput.disabled = true;
    uploadInput.disabled = true;
    handleQuery(currentImage, currentQuery);
  }
});
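
// --- Assumed page markup (illustrative sketch, not part of this script) ---
// Only the element ids and initial disabled states are taken from the code
// above; the structure, classes, and dtype option values are assumptions for
// a minimal test page.
//
// <div id="container" class="disabled">
//   <select id="dtype-select">
//     <option value="">auto</option>
//     <option value="q4f16">q4f16</option>
//     <option value="fp16">fp16</option>
//   </select>
//   <button id="load-model">Load Model</button>
//   <form id="form">
//     <input id="upload" type="file" accept="image/*" disabled />
//     <button id="example" type="button">Use example image</button>
//     <input type="text" placeholder="Ask something about the image" disabled />
//     <button type="submit">Ask</button>
//   </form>
//   <div id="thumb"></div>
//   <span id="status"></span>
//   <p id="llm-output"></p>
// </div>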