File size: 741 Bytes
f098da2
 
 
64a6ef4
 
438833d
 
 
 
f098da2
438833d
 
 
 
 
 
 
8d14cb2
 
 
64a6ef4
f098da2
438833d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import functools
import io

import requests
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering


_VQA_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"


@functools.lru_cache(maxsize=None)
def _load_vqa():
    """Load the ViLT VQA processor and model once and reuse them afterwards."""
    processor = ViltProcessor.from_pretrained(_VQA_CHECKPOINT)
    model = ViltForQuestionAnswering.from_pretrained(_VQA_CHECKPOINT)
    return processor, model


def image(url, text, *, timeout=30):
    """Answer a natural-language question about the image found at a URL.

    Args:
        url: HTTP(S) address of the image to download.
        text: The question to ask about the image.
        timeout: Seconds to wait for the download before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The label from ``model.config.id2label`` with the highest logit.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        PIL.UnidentifiedImageError: If the downloaded bytes are not an image.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of handing an error page to PIL.
    response.raise_for_status()

    # ``response.content`` is the fully downloaded, content-decoded body, so
    # nothing is left reading lazily from an open socket.
    picture = Image.open(io.BytesIO(response.content))
    # Normalise palette, grayscale and alpha-channel images to three channels.
    picture = picture.convert("RGB")

    processor, model = _load_vqa()

    # prepare inputs
    encoding = processor(picture, text, return_tensors="pt")

    # forward pass
    outputs = model(**encoding)
    idx = outputs.logits.argmax(-1).item()
    answer = model.config.id2label[idx]

    print("question asked:", text)
    print("image link:", url)
    print("Predicted answer:", answer)
    return answer

# prepare image + question