Pinwheel committed
Commit 70ba8fd
Parent(s): adae990

Add video demo

Files changed (1):
  1. app.py +66 -23
app.py CHANGED
@@ -1,4 +1,6 @@
 import os
+
+from numpy import true_divide
 import gradio as gr
 import warnings
 
@@ -9,7 +11,9 @@ os.system("python setup.py build develop --user")
 from maskrcnn_benchmark.config import cfg
 from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
 import vqa
-import vqa
+import cv2
+from PIL import Image
+import numpy as np
 
 # Use this command for evaluate the GLIP-T model
 config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
@@ -30,28 +34,67 @@ glip_demo = GLIPDemo(
 )
 blip_demo = vqa.VQA(
     model_path = 'checkpoints/model_base_vqa_capfilt_large.pth'
 )
-
-def predict(image, object, question):
+def predict_image(image, object, question):
     result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
+    result = result[:, :, [2, 1, 0]]
     answer = blip_demo.vqa_demo(image, question)
-    return result[:, :, [2, 1, 0]], answer
+    return result, answer
 
-image = gr.inputs.Image()
-
-gr.Interface(
-    description="GLIP + BLIP VQA Demo.",
-    fn=predict,
-    inputs=[
-        "image",
-        gr.Textbox(label='Objects', lines=1, placeholder="Objects here.."),
-        gr.Textbox(label='Question', lines=1, placeholder="Question here..")],
-
-    outputs=[
-        gr.outputs.Image(
-            type="pil",
-            label="grounding results"
-        ),
-        gr.Textbox(label="Answer")
-    ],
-).launch()
+def predict_video(video, object, question, frame_drop_value):
+    vid = cv2.VideoCapture(video)
+    count = 0
+    while True:
+        ret, frame = vid.read()
+        if ret:
+            count+=1
+            if count % frame_drop_value == 0:
+                # image = Image.fromarray(frame)
+                image = frame
+                cv2.putText(
+                    img = image,
+                    text = str(count),
+                    org = (20, 20),
+                    fontFace = cv2.FONT_HERSHEY_DUPLEX,
+                    fontScale = 0.5,
+                    color = (125, 246, 55),
+                    thickness = 1)
+                result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
+                answer = blip_demo.vqa_demo(image, question)
+                yield result, answer
+        else:
+            break
+
+    yield result, answer
+
+with gr.Blocks() as demo:
+    gr.Markdown("Text-Based Object Detection and Visual Question Answering")
+    with gr.Tab("Image"):
+        with gr.Row():
+            with gr.Column():
+                image_input = gr.Image(label='input image')
+                obj_input = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
+                vqa_input = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
+                image_button = gr.Button("Submit")
+
+            with gr.Column():
+                image_output = gr.outputs.Image(type="pil", label="grounding results")
+                vqa_output = gr.Textbox(label="Answer")
+
+    with gr.Tab("Video"):
+        with gr.Row():
+            with gr.Column():
+                video_input = gr.PlayableVideo(label='input video', mirror_webcam=False)
+                obj_input_video = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
+                vqa_input_video = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
+                frame_drop_input = gr.Slider(label='Frames drop value', minimum=0, maximum=30, step=1, value=5)
+                video_button = gr.Button("Submit")
+
+            with gr.Column():
+                video_output = gr.outputs.Image(type="pil", label="grounding results")
+                vqa_output_video = gr.Textbox(label="Answer")
+
+    image_button.click(predict_image, inputs=[image_input, obj_input, vqa_input], outputs=[image_output, vqa_output])
+    video_button.click(predict_video, inputs=[video_input, obj_input_video, vqa_input_video, frame_drop_input], outputs=[video_output, vqa_output_video])
+demo.queue()
+demo.launch()
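Note on the streaming pattern: predict_video is a generator. It yields a
(result, answer) pair for every sampled frame, so the grounding image and the
VQA answer update live while the video is decoded; Gradio only streams
generator outputs when a queue is enabled, which is why the commit adds
demo.queue() before demo.launch(). A minimal, self-contained sketch of the
same pattern (the slow_counter handler is hypothetical and stands in for
per-frame GLIP/BLIP inference):

import time
import gradio as gr

def slow_counter(steps):
    # Generator handler: each yield is streamed to the UI as it is produced.
    total = 0
    for i in range(1, int(steps) + 1):
        total += i
        time.sleep(0.5)  # stand-in for per-frame model inference
        yield f"step {i}: running total = {total}"

with gr.Blocks() as demo:
    steps_input = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Steps")
    out = gr.Textbox(label="Streamed output")
    gr.Button("Run").click(slow_counter, inputs=steps_input, outputs=out)

demo.queue()   # without the queue, generator handlers cannot stream
demo.launch()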
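Two hazards in the added handler are worth flagging. The slider is declared
with minimum=0, and count % frame_drop_value raises ZeroDivisionError when it
sits at 0; the trailing yield after the loop also re-emits the last pair, and
raises UnboundLocalError if no frame was ever sampled (a clip shorter than the
drop value). The new "from numpy import true_divide" import is unused, likely
an editor auto-import, and gr.outputs.Image is the deprecated pre-Blocks alias
of gr.Image. A defensive variant of the loop, as a sketch only (it reuses
glip_demo and blip_demo from the app and is not part of the commit):

def predict_video_safe(video, object, question, frame_drop_value):
    # Sketch, not part of the commit: clamp the slider value so the modulo
    # below can never divide by zero.
    frame_drop_value = max(int(frame_drop_value), 1)
    vid = cv2.VideoCapture(video)
    count = 0
    try:
        while True:
            ret, frame = vid.read()
            if not ret:
                break
            count += 1
            if count % frame_drop_value == 0:
                # Same channel flip as the original handlers.
                result, _ = glip_demo.run_on_web_image(frame[:, :, [2, 1, 0]], object, 0.5)
                answer = blip_demo.vqa_demo(frame, question)
                yield result, answer
    finally:
        vid.release()  # release the capture even if inference fails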