import { AutoProcessor, Qwen2VLForConditionalGeneration, RawImage } from "@huggingface/transformers";

const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";

const exampleButton = document.getElementById('example');
const promptInput = document.querySelector('input[type="text"]');
const status = document.getElementById('status');
const thumb = document.getElementById('thumb');
const uploadInput = document.getElementById('upload');
const form = document.getElementById('form');
const output = document.getElementById('llm-output');
const dtypeSelect = document.getElementById('dtype-select');
const loadModelButton = document.getElementById('load-model');
const container = document.getElementById('container');

let currentImage = '';
let currentQuery = '';

const model_id = "onnx-community/Qwen2-VL-2B-Instruct";
let processor;
let model;
|
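// Download the processor and the ONNX weights once, then unlock the rest of the UI.
// Triggered by the load-model button below.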
async function initializeSessions() {
  loadModelButton.textContent = 'Loading Model...';
  loadModelButton.classList.add('loading');
  container.classList.add('disabled');

  processor = await AutoProcessor.from_pretrained(model_id);

  const dtype = dtypeSelect.value;
  const options = {
    device: 'webgpu',
    // Store the attention KV cache in fp16.
    'transformers.js_config': { kv_cache_dtype: 'float16' },
  };
  if (dtype) {
    options.dtype = dtype;
  }
  model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, options);

  loadModelButton.textContent = 'Model Ready';
  loadModelButton.classList.remove('loading');
  loadModelButton.classList.add('ready');

  dtypeSelect.disabled = true;
  uploadInput.disabled = false;
  promptInput.disabled = false;
  container.classList.remove('disabled');
}
|
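// Run a single image + prompt query and surface progress or errors via the button label.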
async function handleQuery(imageUrl, query) {
  try {
    loadModelButton.textContent = 'Processing...';

    await imageTextToText(imageUrl, query, (out) => {
      console.log({ out });
      output.textContent = out;
    });

    loadModelButton.textContent = 'Model Ready';
  } catch (err) {
    loadModelButton.textContent = 'Error';
    console.error(err);
  }
}
|
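// Core inference path: preprocess the image and the chat-formatted prompt, generate up to
// 128 new tokens, then decode only the tokens produced after the prompt.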
async function imageTextToText(imagePath, query, cb) {
  const image = await (await RawImage.read(imagePath)).resize(448, 448);
  const conversation = [
    {
      role: "user",
      content: [
        { type: "image" },
        { type: "text", text: query },
      ],
      images: [image],
    },
  ];
  const text = processor.apply_chat_template(conversation, { add_generation_prompt: true });
  const inputs = await processor(text, image);

  const outputs = await model.generate({
    ...inputs,
    max_new_tokens: 128,
  });

  // Slice off the prompt tokens so only the newly generated text is decoded.
  const decoded = processor.batch_decode(
    outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
    { skip_special_tokens: true },
  );

  cb(decoded);
  return decoded;
}
|
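// Render the chosen image as the thumbnail's background, scaled to fit a 320px box.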
async function updatePreview(url) {
  const image = await RawImage.fromURL(url);
  const ar = image.width / image.height;
  const [cw, ch] = (ar > 1) ? [320, 320 / ar] : [320 * ar, 320];
  thumb.style.width = `${cw}px`;
  thumb.style.height = `${ch}px`;
  thumb.style.backgroundImage = `url(${url})`;
  thumb.innerHTML = '';
}
|
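// --- UI wiring: model loading, image selection, prompt input, and form submission ---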
loadModelButton.addEventListener('click', async () => {
  dtypeSelect.disabled = true;
  loadModelButton.disabled = true;
  await initializeSessions();
});
|
exampleButton.addEventListener('click', (e) => {
  e.preventDefault();
  currentImage = EXAMPLE_URL;
  updatePreview(currentImage);
});
|
uploadInput.addEventListener('change', (e) => {
  const file = e.target.files[0];
  if (!file) return;

  // Read the upload as a data URL so it can serve as both the preview and the model input.
  const reader = new FileReader();
  reader.onload = (e2) => {
    currentImage = e2.target.result;
    updatePreview(currentImage);
  };
  reader.readAsDataURL(file);
});
|
// Use the 'input' event so pasted text and deletions are captured as well.
promptInput.addEventListener('input', (e) => {
  currentQuery = e.target.value;
});
|
form.addEventListener('submit', (e) => {
  e.preventDefault();

  if (!currentImage || !currentQuery) {
    loadModelButton.textContent = 'Please select an image and type a prompt';
    setTimeout(() => {
      loadModelButton.textContent = 'Model Ready';
    }, 2000);
  } else {
    promptInput.disabled = true;
    uploadInput.disabled = true;
    handleQuery(currentImage, currentQuery);
  }
});
|
|