# Gradio demo for LaTr (Layout-Aware Transformer) VQA, fine-tuned on the TextVQA dataset

import os

# install PyTesseract (OCR engine used when extracting tokens and boxes from the image)
os.system('pip install -q pytesseract')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Default Library import
import json
import numpy as np
from PIL import Image, ImageDraw
from tqdm.auto import tqdm
import pytesseract

# Visualization and modelling libraries
import torch
import torch.nn as nn
from torch import cuda
import torchvision
from torchvision import transforms
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration, ViTFeatureExtractor, ViTModel

# Specific libraries of LaTr
from utils import convert_ans_to_token, convert_ques_to_token, rotate, convert_token_to_ques, convert_token_to_answer
from modeling import LaTr_for_pretraining, LaTr_for_finetuning, LaTrForVQA
from dataset import load_json_file, get_specific_file, resize_align_bbox, get_tokens_with_boxes, create_features

# Setting the hyperparameters as well as primary configurations
PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512
batch_size = 2
target_size = (500, 384)
t5_model = "t5-base"
device = 'cuda' if cuda.is_available() else 'cpu'

# Configuration for the model
config = {
    't5_model': 't5-base',
    'vocab_size': 32128,
    'hidden_state': 768,
    'max_2d_position_embeddings': 1001,
    'classes': 32128,  # number of output tokens (same as the T5 vocabulary size)
    'seq_len': 512
}

tokenizer = T5Tokenizer.from_pretrained(t5_model)
latr = LaTrForVQA(config)

url = 'https://www.kaggleusercontent.com/kf/99663112/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..tGHcmnLDazeyRNWAxV-KDQ.6unLNRwl7AyVy0Qz3ONE1m_mRNmgC-8VGyS61PdkSeBMV7PpG2B1cD5liuLlok5LQiYrGujrULdtIXKTqCUU_PA3MMSRhi1VKkGMdtrzJLMvzA4jxlWh_qak8P89w4ir4LENyuPCan24M0MOLXYjrm4d1iiy4Hg8pp2o5zWgs0OrVYoh_AJNazOD7pRIjLEAqnM-Pa0LSmvJkfN7j3Zn_Fu9jJ7Pq3Z0rWVtEb-PbeY06f9t-0QK6-JU8K2LdQjuBaCxjgB3BlufgFhKuhU3CZXsJitG7tDnwMSl4JImGfMmBntE2kn9-0dl_aANxaQd2Lsy8KGUDNAdQ2vBpowGQ0-tgDT_w7DpG6DzmUlmzIegqJF1-JyurCO0TrX_RatoPa7jGzuqA5vUT4263-MkoAlR0Xuulq4_pwGV-WnJsrcLuuDtEKFVsYjQvikWM3c9Arw0MsXchYCQkl_OZ6ZqYZ6TZrYxujHE2B6nHxu0F-5xj33vQ2ojaMpHtDplTnqCe4TdmzRWV6LhopfL4x1NXIXry8we4IqgPPwnIy3G2lZVR39nPmNR-8IGjbvweVr6Ci6y1COdbLR4JiTMVc_Nvf2glVKRjppTdcEwLv-j1YR8JsZpZvjaOEokrNkyCG7J0PLJAHlY8iX-pRdBG4vivbSHxnKl3Qppa689VH0RARpOsOBYv-IF-rM1nSmKq7Ci.tXi1B0oNQFlUtxesMcma3w/models/epoch=0-step=34602.ckpt'

try:
    latr = latr.load_from_checkpoint(url)
    print("Checkpoint loaded successfully")
except Exception:
    print("Checkpoint not loaded")

# Gradio input and output components
image = gr.inputs.Image(type="pil")
question = gr.inputs.Textbox(label="Question")
answer = gr.outputs.Textbox(label="Predicted answer")

# Feature extractor for the ViT vision backbone
vit_feat_extract = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")


def answer_question(image, question):

    # Extracting the OCR tokens, their bounding boxes and the resized image
    image.save("sample.png")
    img, boxes, tokenized_words = create_features("sample.png",
                                                  tokenizer=tokenizer,
                                                  target_size=target_size,
                                                  max_seq_length=max_seq_len,
                                                  use_ocr=True)

    ## Converting the boxes to the format required for model input
    boxes = torch.as_tensor(boxes, dtype=torch.int32)
    width = (boxes[:, 2] - boxes[:, 0]).view(-1, 1)
    height = (boxes[:, 3] - boxes[:, 1]).view(-1, 1)
    boxes = torch.cat([boxes, width, height], axis=-1)
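
    # After the concatenation above, each row is [x_min, y_min, x_max, y_max, width, height];
    # columns 4 and 5 hold the derived width/height. The clamp below keeps every column
    # inside the 0-1000 range, which presumably matches the model's 2-D position embedding
    # table (max_2d_position_embeddings = 1001 in `config`).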
    ## Clamping the values, as some of the box coordinates fall out of bounds
    boxes[:, 0] = torch.clamp(boxes[:, 0], min=0, max=1000)
    boxes[:, 2] = torch.clamp(boxes[:, 2], min=0, max=1000)
    boxes[:, 4] = torch.clamp(boxes[:, 4], min=0, max=1000)

    boxes[:, 1] = torch.clamp(boxes[:, 1], min=0, max=1000)
    boxes[:, 3] = torch.clamp(boxes[:, 3], min=0, max=1000)
    boxes[:, 5] = torch.clamp(boxes[:, 5], min=0, max=1000)

    ## Tensorizing the tokenized words
    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)

    img = np.array(img)
    img = torchvision.transforms.ToTensor()(img)
    question = convert_ques_to_token(question=question, tokenizer=tokenizer)

    ## Expanding the dimensions (adding a batch axis) for inference
    boxes = boxes.unsqueeze(0)
    tokenized_words = tokenized_words.unsqueeze(0)
    question = question.unsqueeze(0)

    img = vit_feat_extract(img, return_tensors='pt')['pixel_values']
    if len(img.shape) == 3:
        img = img.unsqueeze(0)

    encoding = {'img': img, 'boxes': boxes, 'tokenized_words': tokenized_words, 'question': question}

    with torch.no_grad():
        logits = latr.forward(encoding)
        logits = logits.squeeze(0)

    _, preds = torch.max(logits, dim=1)
    preds = preds.detach().cpu()

    # Decode everything up to the last non-pad prediction (the pad token id is 0)
    mask = torch.clamp(preds, min=0, max=1)
    last_non_zero_argument = (mask != 0).nonzero()[-1][-1]

    predicted_ans = convert_token_to_ques(preds[:last_non_zero_argument], tokenizer)
    return predicted_ans
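

# Optional smoke test (an editorial sketch, not part of the original demo): if the example
# image referenced below in `examples` is present, run one prediction at startup so a broken
# checkpoint or OCR setup surfaces in the logs before the UI goes live.
if os.path.exists("remote.png"):
    _sample = Image.open("remote.png").convert("RGB")
    print("Smoke test:", answer_question(_sample, "what number is the button near the top left?"))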

# Taken from here: https://huggingface.co/spaces/nielsr/vilt-vqa/blob/main/app.py
title = "Interactive demo: LaTr (Layout Aware Transformer) for VQA"
description = "Gradio demo for LaTr (Layout Aware Transformer), trained on the TextVQA dataset. To use it, simply upload your image, type a question and click 'Submit', or click one of the examples to load them. Read more at the links below."
article = "LaTr: Layout-Aware Transformer for Scene-Text VQA, a novel multimodal architecture for Scene Text Visual Question Answering (STVQA) | Github Repo"
" examples = [['remote.png', "what number is the button near the top left?"]] interface = gr.Interface(fn=answer_question, inputs=[image, question], outputs=answer, examples=examples, title=title, description=description, article=article, enable_queue=True) interface.launch(debug=True)