# Requirements.txt
# Gradio demo application for LaTr (Layout-Aware Transformer) visual question
# answering: loads the model once at import time, then serves
# `answer_question` through a Gradio interface.

# Default Library import
import importlib
import json
import os

# install PyTesseract
# The install has to happen before pytesseract is imported, so it is attempted
# only when the import fails and the import is then retried.
try:
    import pytesseract
except ImportError:
    os.system('pip install -q pytesseract')
    importlib.invalidate_caches()
    import pytesseract

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import gradio as gr
import numpy as np
import torch
import torch.nn as nn
import torchvision
from torch import cuda
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import ViTFeatureExtractor, ViTModel

# Visualization libraries
from PIL import Image, ImageDraw

# Specific libraries of LaTr
from dataset import (
    create_features,
    get_specific_file,
    get_tokens_with_boxes,
    load_json_file,
    resize_align_bbox,
)
from modeling import LaTr_for_finetuning, LaTr_for_pretraining, LaTrForVQA
from utils import (
    convert_ans_to_token,
    convert_ques_to_token,
    convert_token_to_answer,
    convert_token_to_ques,
    rotate,
)

# Setting the hyperparameters as well as primary configurations
PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512
batch_size = 2
target_size = (500, 384)
t5_model = "t5-base"
device = 'cuda' if cuda.is_available() else 'cpu'

# Inclusive bounds applied to every box coordinate, width and height.
# 1000 is the largest index allowed by 'max_2d_position_embeddings' (1001).
MIN_BOX_VALUE = 0
MAX_BOX_VALUE = 1000

# Configuration for the model
config = {
    't5_model': 't5-base',
    'vocab_size': 32128,
    'hidden_state': 768,
    'max_2d_position_embeddings': 1001,
    'classes': 32128,  # number of tokens
    'seq_len': 512
}

tokenizer = T5Tokenizer.from_pretrained(t5_model)
latr = LaTrForVQA(config)

url = 'https://www.kaggleusercontent.com/kf/99663112/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..2HGa6jqeAbugMJYxSkh7eA.XkaLSf8XlITet17Bscupegw9zWLw-IEizSy1lM-_PJF_Gfj-YuinOpDw4ad0M8r-s3WlnclQhHYrd2seaZVjBmkm5WSE6Dae1fW54dnNhyWF5w5O2VafNar7QSuUTSRzacJcmtqI1ypL3OZofwXuETbXq4weeqfDptFS5luxuV0P4Vaer_xEgfsdld6v8O5jjMXwb1CVmPCjMdZUE-HTgzTDiwv3Lb-P3dkRgU7q-iI5GeYZCODYGrX-koxya9DlfzKQZXmJmvtMj45vUZ8OSRB0_hTc7UosQanA-SalWznnOuyOgwl4hMag5toTomriWsxfvJIRBn9CYgFcvUJNqO_kDzBUoAwnagjcxXeEIJTJglwAl9Rs37XyfJAZr7yQ_YTXeRW1j2QMsT_M3qtS96IKRTpsqPVibl8Vrs9Q5g_vKccIQR9t7R9ma_DZLwjWYhDvDO06AZqtdaYGfWaOrbqe8dDvJkZoHsZEO8ukpIH6YNLyCO_dqgRsE77I9jqxiUqQh1KnuNv2hGRSlQR7u8OF7lpiRS7JEwj2MaxlzD58dyhOOLDqrbLp7XWrgV79EQcRYHFSMfhDvG0zmGvHjWGAg-LGhnYIc0NMVhyRv5Pfta9WYEl4qXxCTZWe4olgV79WHLqksQMVyTteheB36n4biHZKx4KZj7k-j3aSI72DIAvj7_UFeHxUTTZ1c6MB.7BF6J5MPMuhQFU48xVZ2qQ/models/epoch=0-step=34602.ckpt'

# Loading the checkpoint is best-effort: the demo still starts with the
# freshly initialised weights when the download or load fails, but the reason
# is reported instead of being silently swallowed.
try:
    latr = latr.load_from_checkpoint(url)
    print("Checkpoint loaded successfully")
except Exception as exc:
    print("Checkpoint not loaded")
    print(f"Reason: {exc!r}")

# The model is only used for inference here; eval mode disables dropout so
# the same image/question pair yields the same answer on every request.
latr.eval()

image = gr.inputs.Image(type="pil")
question = gr.inputs.Textbox(label="Question")
answer = gr.outputs.Textbox(label="Predicted answer")

# NOTE(review): the model name is passed positionally to the constructor
# rather than through `from_pretrained`, so it presumably does not load that
# checkpoint's preprocessing config -- confirm this is intended.
vit_feat_extract = ViTFeatureExtractor("google/vit-base-patch16-224-in21k")


def answer_question(image, question):
    """Predict an answer to `question` about `image` with the LaTr model.

    Args:
        image: PIL image supplied by the Gradio image input. It is written to
            "sample.png" in the working directory so `create_features` can
            read it from disk.
        question: Question text supplied by the Gradio textbox.

    Returns:
        The decoded answer produced by `convert_token_to_ques`, or an empty
        string when the model predicts no answer tokens.
    """
    # Extracting features from the image
    image.save("sample.png")
    img, boxes, tokenized_words = create_features(
        "sample.png",
        tokenizer=tokenizer,
        target_size=target_size,
        max_seq_length=max_seq_len,
        use_ocr=True,
    )

    # Converting the boxes as per the format required for model input:
    # the four box columns followed by width and height.
    boxes = torch.as_tensor(boxes, dtype=torch.int32)
    width = (boxes[:, 2] - boxes[:, 0]).view(-1, 1)
    height = (boxes[:, 3] - boxes[:, 1]).view(-1, 1)
    boxes = torch.cat([boxes, width, height], axis=-1)

    # Clamping the values, as some of the box values are out of bound.
    # A clamp with min == max would overwrite every value with that constant
    # and erase the layout information, so a real [min, max] range is used.
    # NOTE(review): if the checkpoint was trained with collapsed coordinates,
    # re-validate the predictions after this change.
    boxes = torch.clamp(boxes, min=MIN_BOX_VALUE, max=MAX_BOX_VALUE)

    # Tensor tokenized words
    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
    img = np.array(img)
    img = torchvision.transforms.ToTensor()(img)
    question = convert_ques_to_token(question=question, tokenizer=tokenizer)

    # Expanding the dimension for inference (batch of one)
    boxes = boxes.unsqueeze(0)
    tokenized_words = tokenized_words.unsqueeze(0)
    question = question.unsqueeze(0)

    img = vit_feat_extract(img, return_tensors='pt')['pixel_values']
    if img.dim() == 3:
        img = img.unsqueeze(0)

    encoding = {
        'img': img,
        'boxes': boxes,
        'tokenized_words': tokenized_words,
        'question': question,
    }

    with torch.no_grad():
        logits = latr.forward(encoding)
        logits = logits.squeeze(0)

    _, preds = torch.max(logits, dim=1)
    preds = preds.detach().cpu()

    # Token id 0 is treated as padding. The answer is everything before the
    # LAST non-zero prediction (that final token is dropped, as before --
    # presumably the end-of-sequence marker; confirm against the tokenizer).
    # Assumes `preds` is 1-D (one token id per position) -- TODO confirm.
    non_zero_positions = torch.nonzero(preds != 0).flatten()
    if non_zero_positions.numel() == 0:
        return ""
    last_non_zero_argument = int(non_zero_positions[-1])

    answer_tokens = preds[:last_non_zero_argument]
    if answer_tokens.numel() == 0:
        return ""

    predicted_ans = convert_token_to_ques(answer_tokens, tokenizer)
    return predicted_ans


# Taken from here: https://huggingface.co/spaces/nielsr/vilt-vqa/blob/main/app.py
title = "Interactive demo: laTr (Layout Aware Transformer) for VQA"
description = "Gradio Demo for LaTr (Layout Aware Transformer),trained on TextVQA Dataset. To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
article = """

LaTr: Layout-aware transformer for scene-text VQA,a novel multimodal architecture for Scene Text Visual Question Answering (STVQA) | Github Repo

"""
examples = [['remote.png', "Is remote present in the picture?"]]

interface = gr.Interface(
    fn=answer_question,
    inputs=[image, question],
    outputs=answer,
    examples=examples,
    title=title,
    description=description,
    article=article,
    enable_queue=True,
)
interface.launch(debug=True)