"""Visual question answering with the ViLT model fine-tuned on VQA."""

import torch
from PIL import Image
from transformers import ViltForQuestionAnswering, ViltProcessor

# Checkpoint shared by the processor and the model (noted as 470MB in the
# original source). Both are loaded once at import time so that every call
# to ``model_pipeline`` reuses them.
_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"

processor = ViltProcessor.from_pretrained(_CHECKPOINT)
model = ViltForQuestionAnswering.from_pretrained(_CHECKPOINT)


def model_pipeline(text: str, image: Image.Image) -> str:
    """Answer a question about an image.

    Args:
        text: The question to ask about ``image``.
        image: The image the question refers to.

    Returns:
        The label from ``model.config.id2label`` whose logit is highest.
    """
    # Prepare the model inputs as PyTorch tensors.
    encoding = processor(image, text, return_tensors="pt")

    # Inference only: skip autograd bookkeeping during the forward pass.
    with torch.no_grad():
        outputs = model(**encoding)

    # ``.item()`` requires a single-element tensor, so this handles exactly
    # one (image, question) pair per call.
    idx = outputs.logits.argmax(-1).item()
    return model.config.id2label[idx]