Spaces:

Pinwheel
/

GLIP-BLIP-Object-Detection-VQA

Runtime error

App Files Files Community

GLIP-BLIP-Object-Detection-VQA / app.py

Pinwheel

Update app.py

b80bbba over 1 year ago

raw

history blame

4.86 kB

	import os
	from datetime import datetime

	from numpy import true_divide
	import gradio as gr
	import warnings

	warnings.filterwarnings("ignore")

	os.system("python setup.py build develop --user")

	from maskrcnn_benchmark.config import cfg
	from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
	import vqa
	import cv2
	from PIL import Image
	import numpy as np

	# Use this command for evaluate the GLIP-T model
	config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
	weight_file = "checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth"

	# manual override some options
	cfg.local_rank = 0
	cfg.num_gpus = 1
	cfg.merge_from_file(config_file)
	cfg.merge_from_list(["MODEL.WEIGHT", weight_file])
	cfg.merge_from_list(["MODEL.DEVICE", "cuda"])

	glip_demo = GLIPDemo(
	cfg,
	min_image_size=800,
	confidence_threshold=0.7,
	show_mask_heatmaps=False
	)
	blip_demo = vqa.VQA(
	model_path = 'checkpoints/model_base_vqa_capfilt_large.pth')

	def predict_image(image, object, question):
	now = datetime.now()
	dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
	print("TimeStamp {}".format(dt_string))
	result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
	result = result[:, :, [2, 1, 0]]
	answer = blip_demo.vqa_demo(image, question)
	return result, answer

	def predict_video(video, object, question, frame_drop_value):
	now = datetime.now()
	dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
	print("TimeStamp {}".format(dt_string))
	vid = cv2.VideoCapture(video)
	count = 0
	result = None
	answer = None
	while True:
	ret, frame = vid.read()
	if ret:
	count+=1
	if count % frame_drop_value == 0:
	# image = Image.fromarray(frame)
	image = frame
	cv2.putText(
	img = image,
	text = str(count),
	org = (20, 20),
	fontFace = cv2.FONT_HERSHEY_DUPLEX,
	fontScale = 0.5,
	color = (125, 246, 55),
	thickness = 1)
	result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
	answer = blip_demo.vqa_demo(image, question)
	yield result, answer
	else:
	break

	yield result, answer

	with gr.Blocks() as demo:
	gr.Markdown("Text-Based Object Detection and Visual Question Answering")
	with gr.Tab("Image"):
	with gr.Row():
	with gr.Column():
	image_input = gr.Image(label='input image')
	obj_input = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
	vqa_input = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
	image_button = gr.Button("Submit")

	with gr.Column():
	image_output = gr.outputs.Image(type="pil", label="grounding results")
	vqa_output = gr.Textbox(label="Answer")

	with gr.Tab("Video"):
	with gr.Row():
	with gr.Column():
	video_input = gr.PlayableVideo(label='input video', mirror_webcam=False)
	obj_input_video = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
	vqa_input_video = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
	frame_drop_input = gr.Slider(label='Frames drop value', minimum=0, maximum=30, step=1, value=5)
	video_button = gr.Button("Submit")

	with gr.Column():
	video_output = gr.outputs.Image(type="pil", label="grounding results")
	vqa_output_video = gr.Textbox(label="Answer")

	with gr.Tab("Webcam"):
	with gr.Row():
	with gr.Column():
	cam_input = gr.Video(label='input video', mirror_webcam=False, source="webcam")
	obj_input_cam = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
	vqa_input_cam = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
	frame_drop_input_cam = gr.Slider(label='Frames drop value', minimum=0, maximum=30, step=1, value=5)
	cam_button = gr.Button("Submit")

	with gr.Column():
	cam_output = gr.outputs.Image(type="pil", label="grounding results")
	vqa_output_cam = gr.Textbox(label="Answer")

	image_button.click(predict_image, inputs=[image_input, obj_input, vqa_input], outputs=[image_output, vqa_output])
	video_button.click(predict_video, inputs=[video_input, obj_input_video, vqa_input_video, frame_drop_input], outputs=[video_output, vqa_output_video])
	cam_button.click(predict_video, inputs=[cam_input, obj_input_cam, vqa_input_cam, frame_drop_input_cam], outputs=[cam_output, vqa_output_cam])
	demo.queue()
	demo.launch()