Pinwheel committed
Commit 70ba8fd
Parent(s): adae990

Add video demo

Files changed (1):
  1. app.py +66 -23
app.py CHANGED
@@ -1,4 +1,6 @@
 import os
+
+from numpy import true_divide
 import gradio as gr
 import warnings
 
@@ -9,7 +11,9 @@ os.system("python setup.py build develop --user")
 from maskrcnn_benchmark.config import cfg
 from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
 import vqa
-import vqa
+import cv2
+from PIL import Image
+import numpy as np
 
 # Use this command for evaluate the GLIP-T model
 config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
@@ -30,28 +34,67 @@ glip_demo = GLIPDemo(
 )
 blip_demo = vqa.VQA(
     model_path = 'checkpoints/model_base_vqa_capfilt_large.pth'
 )
-
-def predict(image, object, question):
+def predict_image(image, object, question):
     result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
+    result = result[:, :, [2, 1, 0]]
     answer = blip_demo.vqa_demo(image, question)
-    return result[:, :, [2, 1, 0]], answer
+    return result, answer
 
-image = gr.inputs.Image()
-
-gr.Interface(
-    description="GLIP + BLIP VQA Demo.",
-    fn=predict,
-    inputs=[
-        "image",
-        gr.Textbox(label='Objects', lines=1, placeholder="Objects here.."),
-        gr.Textbox(label='Question', lines=1, placeholder="Question here..")],
-
-    outputs=[
-        gr.outputs.Image(
-            type="pil",
-            label="grounding results"
-        ),
-        gr.Textbox(label="Answer")
-    ],
-).launch()
+def predict_video(video, object, question, frame_drop_value):
+    vid = cv2.VideoCapture(video)
+    count = 0
+    while True:
+        ret, frame = vid.read()
+        if ret:
+            count+=1
+            if count % frame_drop_value == 0:
+                # image = Image.fromarray(frame)
+                image = frame
+                cv2.putText(
+                    img = image,
+                    text = str(count),
+                    org = (20, 20),
+                    fontFace = cv2.FONT_HERSHEY_DUPLEX,
+                    fontScale = 0.5,
+                    color = (125, 246, 55),
+                    thickness = 1)
+                result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
+                answer = blip_demo.vqa_demo(image, question)
+                yield result, answer
+        else:
+            break
+
+    yield result, answer
+
+with gr.Blocks() as demo:
+    gr.Markdown("Text-Based Object Detection and Visual Question Answering")
+    with gr.Tab("Image"):
+        with gr.Row():
+            with gr.Column():
+                image_input = gr.Image(label='input image')
+                obj_input = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
+                vqa_input = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
+                image_button = gr.Button("Submit")
+
+            with gr.Column():
+                image_output = gr.outputs.Image(type="pil", label="grounding results")
+                vqa_output = gr.Textbox(label="Answer")
+
+    with gr.Tab("Video"):
+        with gr.Row():
+            with gr.Column():
+                video_input = gr.PlayableVideo(label='input video', mirror_webcam=False)
+                obj_input_video = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
+                vqa_input_video = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
+                frame_drop_input = gr.Slider(label='Frames drop value', minimum=0, maximum=30, step=1, value=5)
+                video_button = gr.Button("Submit")
+
+            with gr.Column():
+                video_output = gr.outputs.Image(type="pil", label="grounding results")
+                vqa_output_video = gr.Textbox(label="Answer")
+
+    image_button.click(predict_image, inputs=[image_input, obj_input, vqa_input], outputs=[image_output, vqa_output])
+    video_button.click(predict_video, inputs=[video_input, obj_input_video, vqa_input_video, frame_drop_input], outputs=[video_output, vqa_output_video])
+demo.queue()
+demo.launch()
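Note on the streaming pattern: predict_video is a generator. It yields a
(result, answer) pair for every sampled frame, so the grounding image and the
VQA answer update live while the video is decoded; Gradio only streams
generator outputs when a queue is enabled, which is why the commit adds
demo.queue() before demo.launch(). A minimal, self-contained sketch of the
same pattern (the slow_counter handler is hypothetical and stands in for
per-frame GLIP/BLIP inference):

import time
import gradio as gr

def slow_counter(steps):
    # Generator handler: each yield is streamed to the UI as it is produced.
    total = 0
    for i in range(1, int(steps) + 1):
        total += i
        time.sleep(0.5)  # stand-in for per-frame model inference
        yield f"step {i}: running total = {total}"

with gr.Blocks() as demo:
    steps_input = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Steps")
    out = gr.Textbox(label="Streamed output")
    gr.Button("Run").click(slow_counter, inputs=steps_input, outputs=out)

demo.queue()   # without the queue, generator handlers cannot stream
demo.launch()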
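Two hazards in the added handler are worth flagging. The slider is declared
with minimum=0, and count % frame_drop_value raises ZeroDivisionError when it
sits at 0; the trailing yield after the loop also re-emits the last pair, and
raises UnboundLocalError if no frame was ever sampled (a clip shorter than the
drop value). The new "from numpy import true_divide" import is unused, likely
an editor auto-import, and gr.outputs.Image is the deprecated pre-Blocks alias
of gr.Image. A defensive variant of the loop, as a sketch only (it reuses
glip_demo and blip_demo from the app and is not part of the commit):

def predict_video_safe(video, object, question, frame_drop_value):
    # Sketch, not part of the commit: clamp the slider value so the modulo
    # below can never divide by zero.
    frame_drop_value = max(int(frame_drop_value), 1)
    vid = cv2.VideoCapture(video)
    count = 0
    try:
        while True:
            ret, frame = vid.read()
            if not ret:
                break
            count += 1
            if count % frame_drop_value == 0:
                # Same channel flip as the original handlers.
                result, _ = glip_demo.run_on_web_image(frame[:, :, [2, 1, 0]], object, 0.5)
                answer = blip_demo.vqa_demo(frame, question)
                yield result, answer
    finally:
        vid.release()  # release the capture even if inference fails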