# Gradio demo for LaTr (Layout-Aware Transformer) visual question answering

# Default Library import
import os
import json

# Install PyTesseract at runtime (harmless if it is already listed in requirements.txt)
os.system('pip install -q pytesseract')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# For the purpose of displaying the progress of pandas' map/apply functions
tqdm.pandas()

# Visualization libraries
from PIL import Image, ImageDraw

# Deep-learning and OCR libraries
import torch
import torch.nn as nn
from torch import cuda
from torchvision import transforms
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pytesseract

import gradio as gr

# Specific libraries of LaTr
from utils import convert_ans_to_token, convert_ques_to_token, rotate, convert_token_to_ques, convert_token_to_answer
from modeling import LaTr_for_pretraining, LaTr_for_finetuning, LaTrForVQA
from dataset import load_json_file, get_specific_file, resize_align_bbox, get_tokens_with_boxes, create_features

# Setting the hyperparameters as well as primary configurations
PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512
batch_size = 2
target_size = (500, 384)
t5_model = "t5-base"
device = 'cuda' if cuda.is_available() else 'cpu'
max_steps = 50000  # fine-tuning scheduler horizon expected by LaTrForVQA; value assumed, it has no effect at inference

# Configuration for the model
config = {
    't5_model': 't5-base',
    'vocab_size': 32128,
    'hidden_state': 768,
    'max_2d_position_embeddings': 1001,
    'classes': 32128,  # number of tokens
    'seq_len': 512
}

tokenizer = T5Tokenizer.from_pretrained(t5_model)
latr = LaTrForVQA(config, max_steps=max_steps)

url = 'https://www.kaggleusercontent.com/kf/99663112/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..tGHcmnLDazeyRNWAxV-KDQ.6unLNRwl7AyVy0Qz3ONE1m_mRNmgC-8VGyS61PdkSeBMV7PpG2B1cD5liuLlok5LQiYrGujrULdtIXKTqCUU_PA3MMSRhi1VKkGMdtrzJLMvzA4jxlWh_qak8P89w4ir4LENyuPCan24M0MOLXYjrm4d1iiy4Hg8pp2o5zWgs0OrVYoh_AJNazOD7pRIjLEAqnM-Pa0LSmvJkfN7j3Zn_Fu9jJ7Pq3Z0rWVtEb-PbeY06f9t-0QK6-JU8K2LdQjuBaCxjgB3BlufgFhKuhU3CZXsJitG7tDnwMSl4JImGfMmBntE2kn9-0dl_aANxaQd2Lsy8KGUDNAdQ2vBpowGQ0-tgDT_w7DpG6DzmUlmzIegqJF1-JyurCO0TrX_RatoPa7jGzuqA5vUT4263-MkoAlR0Xuulq4_pwGV-WnJsrcLuuDtEKFVsYjQvikWM3c9Arw0MsXchYCQkl_OZ6ZqYZ6TZrYxujHE2B6nHxu0F-5xj33vQ2ojaMpHtDplTnqCe4TdmzRWV6LhopfL4x1NXIXry8we4IqgPPwnIy3G2lZVR39nPmNR-8IGjbvweVr6Ci6y1COdbLR4JiTMVc_Nvf2glVKRjppTdcEwLv-j1YR8JsZpZvjaOEokrNkyCG7J0PLJAHlY8iX-pRdBG4vivbSHxnKl3Qppa689VH0RARpOsOBYv-IF-rM1nSmKq7Ci.tXi1B0oNQFlUtxesMcma3w/models/epoch=0-step=34602.ckpt'

try:
    latr = latr.load_from_checkpoint(url)
    print("Checkpoint loaded successfully")
except Exception:
    print("Checkpoint not loaded")

latr.eval()  # inference only: disable dropout etc.

image = gr.inputs.Image(type="pil")
question = gr.inputs.Textbox(label="Question")
answer = gr.outputs.Textbox(label="Predicted answer")
examples = [["remote.jpg", "what number is the button near the top left?"]]


def answer_question(image, question):

    # The OCR pipeline in create_features works from a file path, so save the upload first
    image.save('sample_img.jpg')

    # Extracting features (resized image, word boxes and OCR tokens) from the image
    img, boxes, tokenized_words = create_features(image_path='sample_img.jpg',
                                                  tokenizer=tokenizer,
                                                  target_size=target_size,
                                                  max_seq_length=max_seq_len,
                                                  use_ocr=True)

    # Converting the boxes to the format required for model input:
    # [x_min, y_min, x_max, y_max, width, height]
    boxes = torch.as_tensor(boxes, dtype=torch.int32)
    width = (boxes[:, 2] - boxes[:, 0]).view(-1, 1)
    height = (boxes[:, 3] - boxes[:, 1]).view(-1, 1)
    boxes = torch.cat([boxes, width, height], dim=-1)

    # Clamping the values, as some of the box values are out of bounds;
    # the 2D position embeddings only cover the range [0, 1000]
    boxes = torch.clamp(boxes, min=0, max=1000)

    # Tensorize the tokenized words, the image and the question
    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
    img = transforms.ToTensor()(img)
    question = convert_ques_to_token(question=question, tokenizer=tokenizer)

    # Expanding the dimensions to add a batch axis for inference
    img = img.unsqueeze(0)
    boxes = boxes.unsqueeze(0)
    tokenized_words = tokenized_words.unsqueeze(0)
    question = question.unsqueeze(0)

    encoding = {'img': img, 'boxes': boxes, 'tokenized_words': tokenized_words, 'question': question}

    with torch.no_grad():
        logits = latr.forward(encoding)
        logits = logits.squeeze(0)

    # Greedy decoding: pick the most likely token at each position
    _, preds = torch.max(logits, dim=1)
    preds = preds.detach().cpu()

    # Strip the trailing pad tokens (id 0): keep everything up to the last non-pad token
    mask = torch.clamp(preds, min=0, max=1)
    last_non_zero_argument = (mask != 0).nonzero()[-1].item()
    predicted_ans = convert_token_to_ques(preds[:last_non_zero_argument], tokenizer)

    return predicted_ans


# Adapted from: https://huggingface.co/spaces/nielsr/vilt-vqa/blob/main/app.py
title = "Interactive demo: LaTr (Layout-Aware Transformer) for VQA"
description = "Gradio demo for LaTr (Layout-Aware Transformer), trained on the TextVQA dataset. To use it, upload an image, type a question and click 'Submit', or click one of the examples to load them. Read more at the links below."
article = "LaTr: Layout-Aware Transformer for Scene-Text VQA, a novel multimodal architecture for Scene Text Visual Question Answering (STVQA) | Github Repo"

interface = gr.Interface(fn=answer_question,
                         inputs=[image, question],
                         outputs=answer,
                         examples=examples,
                         title=title,
                         description=description,
                         article=article,
                         enable_queue=True)
interface.launch(debug=True)