import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering

# Dataset and model identifiers. The demo loads the processor and model from
# the local fine-tuned checkpoint directory in `model_path`.
dataset_name = "Multimodal-Fatima/OK-VQA_train"
original_model_name = "microsoft/git-base-vqav2"
model_name = "hyo37009/git-vqa-finetuned-on-ok-vqa"
model_path = "git-vqa-finetuned-on-ok-vqa"

# One example question per bundled image (image1.jpg ... image10.jpg).
questions = [
    "What can happen the objects shown are thrown on the ground?",
    "What was the machine beside the bowl used for?",
    "What kind of cars are in the photo?",
    "What is the hairstyle of the blond called?",
    "How old do you have to be in canada to do this?",
    "Can you guess the place where the man is playing?",
    "What loony tune character is in this photo?",
    "Whose birthday is being celebrated?",
    "Where can that toilet seat be bought?",
    "What do you call the kind of pants that the man on the right is wearing?",
]

# Load the processor and the fine-tuned VQA model from the local checkpoint.
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForVisualQuestionAnswering.from_pretrained(model_path)
model.eval()  # inference only
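
# Note: `model_path` is a local checkpoint directory shipped alongside this demo.
# If it were missing, the Hub id in `model_name` could presumably be passed to
# from_pretrained() instead (assumption; not what the original script does).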


def main(select_exemple_num):
    # The slider value may arrive as a float, so cast it before indexing.
    selectednum = int(select_exemple_num)
    exemple_img = f"image{selectednum}.jpg"
    img = Image.open(exemple_img)
    question = questions[selectednum - 1]

    # Build the multimodal input (image + question) as PyTorch tensors.
    encoding = processor(img, question, return_tensors='pt')
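
    # `encoding` is a BatchEncoding holding the model inputs; typically
    # pixel_values for the image plus input_ids / attention_mask for the
    # question (the exact keys depend on which processor AutoProcessor resolves).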

    # Forward pass; no gradients are needed at inference time.
    with torch.no_grad():
        outputs = model(**encoding)
    logits = outputs.logits
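
    # logits has shape (1, num_labels): one score per candidate answer in the
    # classification head (this assumes a VQA model with a fixed answer
    # vocabulary, which the id2label lookup below relies on).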

    output_str = 'Predicted:\n'

    # Score every candidate answer and keep the five highest-scoring ones.
    predicted_classes = torch.sigmoid(logits)
    probs, classes = torch.topk(predicted_classes, 5)
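
    # Sigmoid treats each answer as an independent (multi-label) score, which
    # matches how VQA classification heads are commonly trained; a softmax over
    # the logits would be an alternative if a normalized distribution were
    # wanted (commentary/assumption, not part of the original script).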
    ans = ''

    # Report each of the top-5 answers with its score; the first (highest-
    # scoring) entry becomes the final answer.
    for prob, class_idx in zip(probs.squeeze().tolist(), classes.squeeze().tolist()):
        print(prob, model.config.id2label[class_idx])
        output_str += f"{prob} {model.config.id2label[class_idx]}\n"
        if not ans:
            ans = model.config.id2label[class_idx]

    print(ans)

    output_str += f"\nSo I think the answer is:\n{ans}"

    return exemple_img, question, output_str


# The slider picks which bundled example (1..len(questions)) to run; the outputs
# show the example image, its question, and the model's answer text.
demo = gr.Interface(
    fn=main,
    inputs=[gr.Slider(1, len(questions), step=1)],
    outputs=["image", "text", "text"],
)
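
# A possible variant (not in the original script): labeled output components,
# e.g. outputs=[gr.Image(label="example"), gr.Textbox(label="question"),
#               gr.Textbox(label="prediction")].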

# share=True asks Gradio to create a temporary public link; it is not needed
# when the demo is hosted on a platform such as Hugging Face Spaces.
demo.launch(share=True)