# Spaces:
# Runtime error
# Runtime error
# Requirements.txt
import json
import os
import subprocess
import sys

import gradio as gr
import numpy as np
import pytesseract
import torch
import torch.nn as nn
from PIL import Image, ImageDraw
from torch import cuda
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration

from dataset import load_json_file, get_specific_file, resize_align_bbox, get_tokens_with_boxes, create_features
from modeling import LaTr_for_pretraining, LaTr_for_finetuning, LaTrForVQA
from utils import convert_ans_to_token, convert_ques_to_token, rotate, convert_token_to_ques, convert_token_to_answer
# Install PyTesseract with the pip that belongs to the interpreter running this
# script, so the package cannot land in a different Python environment.
# NOTE(review): pytesseract is already imported near the top of this file, so
# this install cannot rescue that import within the current process -- confirm
# whether the step is still needed.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "pytesseract"],
    check=False,  # best effort: a failed install must not stop the app here
)

# NOTE(review): presumably silences the Hugging Face tokenizers parallelism
# warning -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Default Library import | |
# Visualization libraries | |
# Specific libraries of LaTr | |
# Setting the hyperparameters as well as primary configurations
PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512
batch_size = 2
target_size = (500, 384)
t5_model = "t5-base"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

# Configuration for the model. The backbone name and the sequence length reuse
# the constants above so the two places cannot drift apart.
config = {
    't5_model': t5_model,
    'vocab_size': 32128,
    'hidden_state': 768,
    'max_2d_position_embeddings': 1001,
    'classes': 32128,  # number of tokens
    'seq_len': max_seq_len,
}
# Tokenizer for the T5 backbone and a freshly initialised LaTr VQA model.
tokenizer = T5Tokenizer.from_pretrained(t5_model)
latr = LaTrForVQA(config)

# Location of the fine-tuned checkpoint.
# NOTE(review): this looks like a signed, possibly time-limited download URL --
# confirm it is still valid, or host the checkpoint somewhere stable.
url = 'https://www.kaggleusercontent.com/kf/99663112/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..2HGa6jqeAbugMJYxSkh7eA.XkaLSf8XlITet17Bscupegw9zWLw-IEizSy1lM-_PJF_Gfj-YuinOpDw4ad0M8r-s3WlnclQhHYrd2seaZVjBmkm5WSE6Dae1fW54dnNhyWF5w5O2VafNar7QSuUTSRzacJcmtqI1ypL3OZofwXuETbXq4weeqfDptFS5luxuV0P4Vaer_xEgfsdld6v8O5jjMXwb1CVmPCjMdZUE-HTgzTDiwv3Lb-P3dkRgU7q-iI5GeYZCODYGrX-koxya9DlfzKQZXmJmvtMj45vUZ8OSRB0_hTc7UosQanA-SalWznnOuyOgwl4hMag5toTomriWsxfvJIRBn9CYgFcvUJNqO_kDzBUoAwnagjcxXeEIJTJglwAl9Rs37XyfJAZr7yQ_YTXeRW1j2QMsT_M3qtS96IKRTpsqPVibl8Vrs9Q5g_vKccIQR9t7R9ma_DZLwjWYhDvDO06AZqtdaYGfWaOrbqe8dDvJkZoHsZEO8ukpIH6YNLyCO_dqgRsE77I9jqxiUqQh1KnuNv2hGRSlQR7u8OF7lpiRS7JEwj2MaxlzD58dyhOOLDqrbLp7XWrgV79EQcRYHFSMfhDvG0zmGvHjWGAg-LGhnYIc0NMVhyRv5Pfta9WYEl4qXxCTZWe4olgV79WHLqksQMVyTteheB36n4biHZKx4KZj7k-j3aSI72DIAvj7_UFeHxUTTZ1c6MB.7BF6J5MPMuhQFU48xVZ2qQ/models/epoch=0-step=34602.ckpt'

# Loading is best effort: when it fails the app keeps running with the
# untrained weights created above. Only ordinary errors are caught (so Ctrl-C
# still works) and the reason is printed instead of being silently dropped.
try:
    latr = latr.load_from_checkpoint(url)
    print("Checkpoint loaded successfully")
except Exception as exc:
    print("Checkpoint not loaded")
    print(f"Reason: {exc!r}")

# The model is only used for inference below, so switch off training-only
# behaviour such as dropout.
# NOTE(review): assumes LaTrForVQA is a torch.nn.Module -- confirm.
latr.eval()
# Gradio input/output components: a PIL image plus a question text box go in,
# a text box with the predicted answer comes out.
# NOTE(review): gr.inputs / gr.outputs are legacy Gradio namespaces -- confirm
# they exist in the Gradio version this app is deployed with.
image = gr.inputs.Image(type="pil")
question = gr.inputs.Textbox(label="Question")
answer = gr.outputs.Textbox(label="Predicted answer")

# Example (image path, question) rows; this name is assigned again further
# down, before the interface is built.
examples = [["remote.jpg", "what number is the button near the top left?"]]
from transformers import ViTFeatureExtractor, ViTModel

# Load the preprocessing settings published with the checkpoint. Handing the
# checkpoint name directly to the constructor does not load anything: the
# string is bound to the constructor's first positional parameter instead.
vit_feat_extract = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

import torchvision
import numpy as np
def answer_question(image, question):
    """Answer a free-text question about an image with the LaTr model.

    Args:
        image: Input picture; it must provide ``save(path)`` (the Gradio
            image input of this app is declared with ``type="pil"``).
        question: Question text; it is tokenized with
            ``convert_ques_to_token``.

    Returns:
        Whatever ``convert_token_to_ques`` produces for the predicted token
        ids (presumably the decoded answer text -- TODO confirm), or an empty
        string when every predicted id is 0.
    """
    # create_features is called with a file path, so write the upload to disk.
    # NOTE(review): the fixed file name is shared by all requests -- confirm
    # that requests are never processed concurrently.
    image.save("sample.png")
    img, boxes, tokenized_words = create_features(
        "sample.png",
        tokenizer=tokenizer,
        target_size=target_size,
        max_seq_length=max_seq_len,
        use_ocr=True,
    )

    # Append a width and a height column to every box.
    # assumes the first four columns are (x0, y0, x1, y1) -- TODO confirm
    boxes = torch.as_tensor(boxes, dtype=torch.int32)
    width = (boxes[:, 2] - boxes[:, 0]).view(-1, 1)
    height = (boxes[:, 3] - boxes[:, 1]).view(-1, 1)
    boxes = torch.cat([boxes, width, height], dim=-1)

    # Some box values are out of bound, so limit all six columns to
    # [0, 1000]. The lower and upper limits must differ: clamping with
    # min == max would overwrite every value with that constant and erase the
    # layout information.
    boxes = torch.clamp(boxes, min=0, max=1000)

    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)

    img = np.array(img)
    img = torchvision.transforms.ToTensor()(img)
    question = convert_ques_to_token(question=question, tokenizer=tokenizer)

    # Add the leading batch dimension for inference.
    boxes = boxes.unsqueeze(0)
    tokenized_words = tokenized_words.unsqueeze(0)
    question = question.unsqueeze(0)

    img = vit_feat_extract(img, return_tensors='pt')['pixel_values']
    if img.dim() == 3:
        img = img.unsqueeze(0)

    encoding = {
        'img': img,
        'boxes': boxes,
        'tokenized_words': tokenized_words,
        'question': question,
    }

    with torch.no_grad():
        logits = latr.forward(encoding)

    # assumes logits is (1, seq_len, classes), so preds is 1-D -- TODO confirm
    logits = logits.squeeze(0)
    preds = logits.argmax(dim=1).detach().cpu()

    # Positions whose predicted id is not 0.
    # NOTE(review): id 0 is presumably the padding id -- confirm.
    non_zero_positions = torch.nonzero(preds, as_tuple=True)[-1]
    if non_zero_positions.numel() == 0:
        # Nothing but zeros was predicted, so there is no answer to decode.
        return ""

    # Decode everything before the LAST non-zero position (that final token
    # is presumably the end-of-sequence marker -- TODO confirm).
    last_non_zero_argument = int(non_zero_positions[-1])
    predicted_ans = convert_token_to_ques(preds[:last_non_zero_argument], tokenizer)
    return predicted_ans
# Taken from here: https://huggingface.co/spaces/nielsr/vilt-vqa/blob/main/app.py
# Texts shown on the demo page.
title = "Interactive demo: LaTr (Layout Aware Transformer) for VQA"
description = "Gradio Demo for LaTr (Layout Aware Transformer),trained on TextVQA Dataset. To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.12494' target='_blank'>LaTr: Layout-aware transformer for scene-text VQA,a novel multimodal architecture for Scene Text Visual Question Answering (STVQA)</a> | <a href='https://github.com/uakarsh/latr' target='_blank'>Github Repo</a></p>"

# Example (image path, question) rows offered to the user.
examples = [['remote.png', "Is remote present in the picture?"]]

# Wire answer_question to the image/question inputs and the answer output,
# then start the app.
interface = gr.Interface(
    fn=answer_question,
    inputs=[image, question],
    outputs=answer,
    examples=examples,
    title=title,
    description=description,
    article=article,
    enable_queue=True,
)
interface.launch(debug=True)