# Hugging Face Spaces status when this file was captured: Runtime error
import os | |
from datetime import datetime | |
from numpy import true_divide | |
import gradio as gr | |
import warnings | |
warnings.filterwarnings("ignore") | |
os.system("python setup.py build develop --user") | |
from maskrcnn_benchmark.config import cfg | |
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo | |
import vqa | |
import cv2 | |
from PIL import Image | |
import numpy as np | |
# Configuration and checkpoint used to evaluate the GLIP-T model.
config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
weight_file = "checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth"

# Manually override some options. The process-level fields are set first,
# then the YAML file is merged, and finally the key/value overrides are
# applied so that they win over whatever the file specifies.
cfg.local_rank = 0
cfg.num_gpus = 1
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file, "MODEL.DEVICE", "cuda"])

# Grounding (text-conditioned object detection) model.
glip_demo = GLIPDemo(cfg, min_image_size=800, confidence_threshold=0.7, show_mask_heatmaps=False)

# Visual question answering model.
blip_demo = vqa.VQA(model_path='checkpoints/model_base_vqa_capfilt_large.pth')
def predict_image(image, object, question):
    """Run grounding and visual question answering on a single image.

    Args:
        image: Image array indexed as ``[row, col, channel]`` with at least
            three channels (presumably the RGB array Gradio hands over --
            confirm against the input component).
        object: Text passed to ``glip_demo.run_on_web_image`` describing what
            to ground in the image.
        question: Question passed to ``blip_demo.vqa_demo`` about the image.

    Returns:
        A ``(result, answer)`` tuple: the grounding result with its first
        three channels put back in the input's order, and the VQA answer.
    """
    # Log when the request arrived.
    stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    print("TimeStamp {}".format(stamp))

    # The grounding model is fed the image with its channel order reversed,
    # so the same reversal is applied to its output before returning it.
    reverse_channels = [2, 1, 0]
    grounded, _ = glip_demo.run_on_web_image(image[:, :, reverse_channels], object, 0.5)
    restored = grounded[:, :, reverse_channels]

    # The VQA model receives the image exactly as it was passed in.
    return restored, blip_demo.vqa_demo(image, question)
def predict_video(video, object, question, frame_drop_value):
    """Stream grounding and VQA results for sampled frames of a video.

    Every ``frame_drop_value``-th frame is stamped with its frame number,
    grounded with ``glip_demo`` and queried with ``blip_demo``; each processed
    frame yields one ``(result, answer)`` pair. The last pair is yielded once
    more after the video ends so it remains the final output.

    Args:
        video: Path (or anything ``cv2.VideoCapture`` accepts) of the video.
            A falsy value yields nothing.
        object: Text describing what to ground in each frame.
        question: Question asked about each processed frame.
        frame_drop_value: Process one frame out of this many. Values below 1
            (the UI slider can send 0) or values that cannot be converted to
            an integer are treated as 1, i.e. every frame is processed.

    Yields:
        ``(result, answer)`` tuples: the grounding result in RGB channel
        order and the VQA answer for that frame.
    """
    print("TimeStamp {}".format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))

    # Nothing was uploaded or recorded: there is nothing to process.
    if not video:
        return

    # Clamp the sampling step so the modulo below can never divide by zero.
    try:
        step = max(1, int(frame_drop_value))
    except (TypeError, ValueError):
        step = 1

    last_output = None
    vid = cv2.VideoCapture(video)
    try:
        count = 0
        while True:
            ret, frame = vid.read()
            if not ret:
                # End of stream, or the video could not be opened or decoded.
                break
            count += 1
            if count % step != 0:
                continue

            # Stamp the frame number onto the frame before inference.
            cv2.putText(
                img=frame,
                text=str(count),
                org=(20, 20),
                fontFace=cv2.FONT_HERSHEY_DUPLEX,
                fontScale=0.5,
                color=(125, 246, 55),
                thickness=1,
            )

            # OpenCV decodes frames as BGR. predict_image converts its RGB
            # input to BGR for the grounding model, flips the result back to
            # RGB for display, and gives the VQA model the RGB image; the
            # same convention is followed here.
            grounded, _ = glip_demo.run_on_web_image(frame, object, 0.5)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            answer = blip_demo.vqa_demo(rgb_frame, question)

            last_output = (grounded[:, :, [2, 1, 0]], answer)
            yield last_output
    finally:
        # Always free the decoder, including when the consumer stops early.
        vid.release()

    # Repeat the last result as the final output, but only if at least one
    # frame was processed (otherwise there is nothing to show).
    if last_output is not None:
        yield last_output
# User interface: three tabs (single image, uploaded video, webcam recording),
# each with an object prompt, a question, and a grounding/answer output pair.
with gr.Blocks() as demo:
    gr.Markdown("Text-Based Object Detection and Visual Question Answering")

    with gr.Tab("Image"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label='input image')
                obj_input = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                image_button = gr.Button("Submit")
            with gr.Column():
                # gr.Image replaces the deprecated gr.outputs.Image.
                image_output = gr.Image(type="pil", label="grounding results")
                vqa_output = gr.Textbox(label="Answer")

    with gr.Tab("Video"):
        with gr.Row():
            with gr.Column():
                video_input = gr.PlayableVideo(label='input video', mirror_webcam=False)
                obj_input_video = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input_video = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                # Minimum is 1: the value is used as a modulo divisor when
                # sampling frames, so 0 would raise ZeroDivisionError.
                frame_drop_input = gr.Slider(label='Frames drop value', minimum=1, maximum=30, step=1, value=5)
                video_button = gr.Button("Submit")
            with gr.Column():
                video_output = gr.Image(type="pil", label="grounding results")
                vqa_output_video = gr.Textbox(label="Answer")

    with gr.Tab("Webcam"):
        with gr.Row():
            with gr.Column():
                cam_input = gr.Video(label='input video', mirror_webcam=False, source="webcam")
                obj_input_cam = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input_cam = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                # Minimum is 1 for the same reason as on the Video tab.
                frame_drop_input_cam = gr.Slider(label='Frames drop value', minimum=1, maximum=30, step=1, value=5)
                cam_button = gr.Button("Submit")
            with gr.Column():
                cam_output = gr.Image(type="pil", label="grounding results")
                vqa_output_cam = gr.Textbox(label="Answer")

    # Wire each Submit button to its handler. The video and webcam tabs share
    # predict_video, which is a generator and streams one result per frame.
    image_button.click(predict_image, inputs=[image_input, obj_input, vqa_input], outputs=[image_output, vqa_output])
    video_button.click(predict_video, inputs=[video_input, obj_input_video, vqa_input_video, frame_drop_input], outputs=[video_output, vqa_output_video])
    cam_button.click(predict_video, inputs=[cam_input, obj_input_cam, vqa_input_cam, frame_drop_input_cam], outputs=[cam_output, vqa_output_cam])

# Queueing must be enabled for generator handlers to stream their outputs.
demo.queue()
demo.launch()