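# Gradio demo that combines GLIP (text-prompted object detection / grounding)
# with a BLIP visual question answering model, exposed through Image, Video,
# and Webcam tabs.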
import os
from datetime import datetime
import gradio as gr
import warnings
warnings.filterwarnings("ignore")
# Build the maskrcnn_benchmark extensions that GLIP depends on before importing it
os.system("python setup.py build develop --user")
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
import vqa
import cv2
from PIL import Image
import numpy as np
# Config and checkpoint for evaluating the GLIP-T model
config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
weight_file = "checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth"
# manually override some options
cfg.local_rank = 0
cfg.num_gpus = 1
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file])
cfg.merge_from_list(["MODEL.DEVICE", "cuda"])
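# Text-prompted object detection (grounding) with GLIP-T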
glip_demo = GLIPDemo(
    cfg,
    min_image_size=800,
    confidence_threshold=0.7,
    show_mask_heatmaps=False
)
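# BLIP model for visual question answering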
blip_demo = vqa.VQA(
    model_path='checkpoints/model_base_vqa_capfilt_large.pth')
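# Run grounding + VQA on a single image; returns the annotated image and the answer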
def predict_image(image, object, question):
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    print("TimeStamp {}".format(dt_string))
    # Swap the channel order before passing the image to GLIP, then swap the
    # visualization back for display
    result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
    result = result[:, :, [2, 1, 0]]
    answer = blip_demo.vqa_demo(image, question)
    return result, answer
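# Generator: sample every Nth frame of the video, run grounding + VQA on it,
# and yield intermediate results so the UI updates while the video is processed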
def predict_video(video, object, question, frame_drop_value):
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    print("TimeStamp {}".format(dt_string))
    # Guard against a zero slider value, which would cause a modulo-by-zero error
    frame_drop_value = max(int(frame_drop_value), 1)
    vid = cv2.VideoCapture(video)
    count = 0
    result = None
    answer = None
    while True:
        ret, frame = vid.read()
        if not ret:
            break
        count += 1
        # Only process every frame_drop_value-th frame to keep the stream responsive
        if count % frame_drop_value == 0:
            image = frame
            # Draw the frame index in the top-left corner
            cv2.putText(
                img=image,
                text=str(count),
                org=(20, 20),
                fontFace=cv2.FONT_HERSHEY_DUPLEX,
                fontScale=0.5,
                color=(125, 246, 55),
                thickness=1)
            result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
            answer = blip_demo.vqa_demo(image, question)
            # Yield intermediate results as they are produced
            yield result, answer
    vid.release()
    yield result, answer
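# UI: three tabs (Image, Video, Webcam) wired to the two predictors above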
with gr.Blocks() as demo:
    gr.Markdown("Text-Based Object Detection and Visual Question Answering")
    with gr.Tab("Image"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label='input image')
                obj_input = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                image_button = gr.Button("Submit")
            with gr.Column():
                image_output = gr.Image(type="pil", label="grounding results")
                vqa_output = gr.Textbox(label="Answer")
    with gr.Tab("Video"):
        with gr.Row():
            with gr.Column():
                video_input = gr.PlayableVideo(label='input video', mirror_webcam=False)
                obj_input_video = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input_video = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                frame_drop_input = gr.Slider(label='Frames drop value', minimum=1, maximum=30, step=1, value=5)
                video_button = gr.Button("Submit")
            with gr.Column():
                video_output = gr.Image(type="pil", label="grounding results")
                vqa_output_video = gr.Textbox(label="Answer")
    with gr.Tab("Webcam"):
        with gr.Row():
            with gr.Column():
                cam_input = gr.Video(label='input video', mirror_webcam=False, source="webcam")
                obj_input_cam = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input_cam = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                frame_drop_input_cam = gr.Slider(label='Frames drop value', minimum=1, maximum=30, step=1, value=5)
                cam_button = gr.Button("Submit")
            with gr.Column():
                cam_output = gr.Image(type="pil", label="grounding results")
                vqa_output_cam = gr.Textbox(label="Answer")
    image_button.click(predict_image, inputs=[image_input, obj_input, vqa_input], outputs=[image_output, vqa_output])
    video_button.click(predict_video, inputs=[video_input, obj_input_video, vqa_input_video, frame_drop_input], outputs=[video_output, vqa_output_video])
    cam_button.click(predict_video, inputs=[cam_input, obj_input_cam, vqa_input_cam, frame_drop_input_cam], outputs=[cam_output, vqa_output_cam])
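# queue() is required so the generator outputs from predict_video stream to the UI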
demo.queue()
demo.launch()