Spaces:

iakarshu
/

latr-vqa

Runtime error

App Files Files Community

latr-vqa / app.py

iakarshu

Update app.py

e9fc0b6 over 2 years ago

raw

history blame

6.02 kB

	# Requirements.txt
	import gradio as gr
	from transformers import T5Tokenizer, T5ForConditionalGeneration
	from torch import cuda
	from utils import convert_ans_to_token, convert_ques_to_token, rotate, convert_token_to_ques, convert_token_to_answer
	from modeling import LaTr_for_pretraining, LaTr_for_finetuning, LaTrForVQA
	from dataset import load_json_file, get_specific_file, resize_align_bbox, get_tokens_with_boxes, create_features
	import torch.nn as nn
	from PIL import Image, ImageDraw
	import pytesseract
	import pandas as pd
	from tqdm.auto import tqdm
	import numpy as np
	import json
	import os
	from torchvision import transforms



	# Best-effort runtime install of PyTesseract.
	# `sys.executable -m pip` targets the interpreter that is actually running
	# this script (a bare `pip` on PATH may belong to another environment), and
	# the argument list avoids going through a shell.
	# NOTE(review): `import pytesseract` at the top of this file executes before
	# this line, so this cannot rescue a missing package on first start; listing
	# pytesseract in requirements.txt is the reliable fix.
	import subprocess
	import sys

	subprocess.run(
	    [sys.executable, "-m", "pip", "install", "-q", "pytesseract"],
	    check=False,  # keep the original best-effort behaviour: never abort startup
	)

	# Silence the Hugging Face tokenizers fork/parallelism warning.
	os.environ["TOKENIZERS_PARALLELISM"] = "false"

	# Register tqdm's progress-bar helpers on pandas objects.
	tqdm.pandas()

	# Hyperparameters and primary configuration.

	PAD_TOKEN_BOX = [0, 0, 0, 0]
	max_seq_len = 512           # maximum token sequence length
	batch_size = 2
	target_size = (500, 384)    # passed to create_features as `target_size`
	t5_model = "t5-base"        # checkpoint name used for the tokenizer

	# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
	device = 'cuda' if cuda.is_available() else 'cpu'

	# Configuration dictionary handed to LaTrForVQA.
	# The backbone name and sequence length are taken from the constants above
	# so the two places cannot drift apart.
	config = {
	    't5_model': t5_model,
	    'vocab_size': 32128,
	    'hidden_state': 768,
	    'max_2d_position_embeddings': 1001,
	    'classes': 32128,  # number of tokens
	    'seq_len': max_seq_len,
	}

	# Tokenizer for the T5 checkpoint named by `t5_model`.
	tokenizer = T5Tokenizer.from_pretrained(t5_model)

	# `max_steps` was referenced below without ever being defined, which raised
	# a NameError at import time.
	# NOTE(review): presumably this only feeds the training schedule and is
	# irrelevant for inference; 50000 is a placeholder — confirm against
	# modeling.LaTrForVQA.
	max_steps = 50000
	latr = LaTrForVQA(config, max_steps=max_steps)

	# NOTE(review): this looks like a signed, time-limited Kaggle download link;
	# once it expires the checkpoint load below fails and the demo runs with
	# whatever weights LaTrForVQA was constructed with. Consider hosting the
	# checkpoint somewhere stable.
	url = 'https://www.kaggleusercontent.com/kf/99663112/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..tGHcmnLDazeyRNWAxV-KDQ.6unLNRwl7AyVy0Qz3ONE1m_mRNmgC-8VGyS61PdkSeBMV7PpG2B1cD5liuLlok5LQiYrGujrULdtIXKTqCUU_PA3MMSRhi1VKkGMdtrzJLMvzA4jxlWh_qak8P89w4ir4LENyuPCan24M0MOLXYjrm4d1iiy4Hg8pp2o5zWgs0OrVYoh_AJNazOD7pRIjLEAqnM-Pa0LSmvJkfN7j3Zn_Fu9jJ7Pq3Z0rWVtEb-PbeY06f9t-0QK6-JU8K2LdQjuBaCxjgB3BlufgFhKuhU3CZXsJitG7tDnwMSl4JImGfMmBntE2kn9-0dl_aANxaQd2Lsy8KGUDNAdQ2vBpowGQ0-tgDT_w7DpG6DzmUlmzIegqJF1-JyurCO0TrX_RatoPa7jGzuqA5vUT4263-MkoAlR0Xuulq4_pwGV-WnJsrcLuuDtEKFVsYjQvikWM3c9Arw0MsXchYCQkl_OZ6ZqYZ6TZrYxujHE2B6nHxu0F-5xj33vQ2ojaMpHtDplTnqCe4TdmzRWV6LhopfL4x1NXIXry8we4IqgPPwnIy3G2lZVR39nPmNR-8IGjbvweVr6Ci6y1COdbLR4JiTMVc_Nvf2glVKRjppTdcEwLv-j1YR8JsZpZvjaOEokrNkyCG7J0PLJAHlY8iX-pRdBG4vivbSHxnKl3Qppa689VH0RARpOsOBYv-IF-rM1nSmKq7Ci.tXi1B0oNQFlUtxesMcma3w/models/epoch=0-step=34602.ckpt'

	# Loading stays best-effort (the app still starts without the checkpoint),
	# but only ordinary errors are caught and the reason is reported instead of
	# being silently discarded. KeyboardInterrupt/SystemExit now propagate.
	try:
	    latr = latr.load_from_checkpoint(url)
	    print("Checkpoint loaded successfully")
	except Exception as exc:
	    print("Checkpoint not loaded")
	    print("Reason:", repr(exc))

	# Inference only: switch off dropout and other train-time behaviour.
	# NOTE(review): assumes LaTrForVQA is a torch nn.Module (it exposes
	# `forward` and `load_from_checkpoint`) — confirm.
	latr.eval()


	# Gradio widgets wired into the interface further down:
	# a PIL image and a question textbox as inputs, a textbox as output.
	# NOTE(review): `gr.inputs` / `gr.outputs` are the legacy Gradio namespaces;
	# check the pinned Gradio version before upgrading.
	image = gr.inputs.Image(type="pil")
	question = gr.inputs.Textbox(label="Question")
	answer = gr.outputs.Textbox(label="Predicted answer")

	# One clickable example row: [image file, question].
	_example_image = "remote.jpg"
	_example_question = "what number is the button near the top left?"
	examples = [[_example_image, _example_question]]


	def answer_question(image, question):
	    """Answer a question about an image with the module-level LaTr model.

	    The image is written to ``sample_img.jpg`` in the working directory and
	    passed through ``create_features`` with OCR enabled. The resulting
	    features are combined with the tokenized question and fed to
	    ``latr.forward``. The arg-max token ids are then decoded back to text.

	    Args:
	        image: Image object exposing ``save(path)``; the Gradio input
	            component is declared with ``type="pil"``.
	        question: Question text about the image.

	    Returns:
	        The decoded prediction as returned by ``convert_token_to_ques``.
	    """
	    # The module only binds `cuda` and `nn` from torch, never `torch`
	    # itself, so import it here to keep this function self-contained.
	    import torch

	    image.save('sample_img.jpg')

	    # Extracting features from the image. The sequence-length constant in
	    # this module is `max_seq_len`.
	    img, boxes, tokenized_words = create_features(
	        image_path='sample_img.jpg',
	        tokenizer=tokenizer,
	        target_size=target_size,
	        max_seq_length=max_seq_len,
	        use_ocr=True,
	    )

	    # Append width and height as two extra columns, giving six per box.
	    # NOTE(review): presumes the four incoming columns are x0, y0, x1, y1 —
	    # confirm against dataset.create_features.
	    boxes = torch.as_tensor(boxes, dtype=torch.int32)
	    width = (boxes[:, 2] - boxes[:, 0]).view(-1, 1)
	    height = (boxes[:, 3] - boxes[:, 1]).view(-1, 1)
	    boxes = torch.cat([boxes, width, height], dim=-1)

	    # Clamp out-of-bound values into [0, 1000]. The previous per-column
	    # clamps used min == max, which overwrote every coordinate with a
	    # constant (0 or 1000) and discarded all layout information.
	    # NOTE(review): 1000 presumably pairs with
	    # config['max_2d_position_embeddings'] == 1001 — confirm.
	    boxes = torch.clamp(boxes, min=0, max=1000)

	    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
	    img = transforms.ToTensor()(img)
	    question = convert_ques_to_token(question=question, tokenizer=tokenizer)

	    # Add a leading batch dimension of 1 to every model input.
	    encoding = {
	        'img': img.unsqueeze(0),
	        'boxes': boxes.unsqueeze(0),
	        'tokenized_words': tokenized_words.unsqueeze(0),
	        'question': question.unsqueeze(0),
	    }

	    with torch.no_grad():
	        logits = latr.forward(encoding)
	    # NOTE(review): assumes logits is (1, seq_len, classes) — confirm.
	    logits = logits.squeeze(0)

	    # Most likely token id at every position.
	    preds = logits.argmax(dim=1).detach().cpu()

	    # Cut the prediction at the last non-zero token id. The old
	    # `nonzero()[1][-1]` picked the *second* non-zero position and raised
	    # IndexError when fewer than two tokens were non-zero.
	    # NOTE(review): the slice is exclusive, as in the original, so the last
	    # non-zero token itself is dropped (presumably an end-of-sequence id) —
	    # confirm against the tokenizer.
	    nonzero_positions = torch.nonzero(preds, as_tuple=True)[0]
	    end = int(nonzero_positions[-1]) if nonzero_positions.numel() else 0

	    predicted_ans = convert_token_to_ques(preds[:end], tokenizer)
	    return predicted_ans


	# Page text shown around the demo.
	# Layout taken from here: https://huggingface.co/spaces/nielsr/vilt-vqa/blob/main/app.py
	title = "Interactive demo: laTr (Layout Aware Transformer) for VQA"
	description = "Gradio Demo for LaTr (Layout Aware Transformer),trained on TextVQA Dataset. To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
	# The separator between the two links was written as `\|`, which is an
	# invalid escape sequence in a normal string literal (a warning today, an
	# error in future Python versions) and leaves a stray backslash in the text.
	article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.12494' target='_blank'>LaTr: Layout-aware transformer for scene-text VQA,a novel multimodal architecture for Scene Text Visual Question Answering (STVQA)</a> | <a href='https://github.com/uakarsh/latr' target='_blank'>Github Repo</a></p>"

	# Collect the Interface options first, then build the UI around
	# `answer_question` and start serving it.
	_interface_options = dict(
	    fn=answer_question,
	    inputs=[image, question],
	    outputs=answer,
	    examples=examples,
	    title=title,
	    description=description,
	    article=article,
	    enable_queue=True,
	)
	interface = gr.Interface(**_interface_options)

	# debug=True, as in the original launch call.
	interface.launch(debug=True)