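# app.py: Gradio demo for "Contextual Object Detection with Multimodal Large Language Models" (ContextDET).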
import os

# Build and install this repo's extension modules before importing the demo code.
os.system("python setup.py build develop --user")
import gradio as gr
from app_util import ContextDetDemo
header = '''
<div align="center">
<h1 id="title" style="font-weight: 900; margin-bottom: 7px;">
Contextual Object Detection with Multimodal Large Language Models
</h1>
</div>
'''
abstract = '''
🤗 This is the official Gradio demo for <b>Contextual Object Detection with Multimodal Large Language Models</b>.
🆒 Our goal is to promote object detection with better `context understanding` and to enable `interactive feedback`
through `human language vocabulary`, all made possible by multimodal large language models!
🤝 This demo is still under construction. Your comments and suggestions are welcome!
⚡ For faster inference without waiting in the queue, you may duplicate this Space and use a GPU:
<a href="https://huggingface.co/spaces/yuhangzang/ContextDet-Demo?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
'''
footer = r'''
🦁 **GitHub Repo**

We would be grateful if you would consider starring our <a href="https://github.com/yuhangzang/ContextDET">GitHub repo</a>.

📝 **Citation**

If you find our work useful, we would be grateful if you would consider citing it:
```bibtex
@article{zang2023contextual,
  author  = {Zang, Yuhang and Li, Wei and Han, Jun and Zhou, Kaiyang and Loy, Chen Change},
  title   = {Contextual Object Detection with Multimodal Large Language Models},
  journal = {arXiv preprint arXiv:2305.18279},
  year    = {2023},
}
```

📋 **License**

This project is licensed under the
<a rel="license" href="https://github.com/sczhou/CodeFormer/blob/master/LICENSE">S-Lab License 1.0</a>.
Redistribution and use for non-commercial purposes should follow this license.

📧 **Contact**

If you have any questions, please feel free to contact Yuhang Zang <b>(zang0012@ntu.edu.sg)</b>.
'''
css = '''
h1#title {
text-align: center;
}
'''
cloze_samples = [
["main_4.jpg", "A teacher is helping a <mask> with her homework at desk."],
["main_5.jpg", "A man crossing a busy <mask> with his <mask> up."],
]
captioning_samples = [
["main_1.jpg"],
["main_2.jpg"],
["main_4.jpg"],
["main_6.jpeg"],
]
qa_samples = [
["main_5.jpg", "What is his career?"],
["main_6.jpeg", "What are they doing?"],
]
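# Load the ContextDET demo model from the local checkpoint.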
contextdet_model = ContextDetDemo('./ckpt.pth')
def inference_fn_select(image_input, text_input, task_button, history=None):
    # Use None as the default to avoid the shared mutable-default-argument pitfall.
    history = [] if history is None else history
    return contextdet_model.forward(image_input, text_input, task_button, history)
# Example-selection helpers: each returns (image update, text update, task name).
def set_cloze_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(example[1]), 'Cloze Test'

def set_captioning_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(''), 'Captioning'

def set_qa_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(example[1]), 'Question Answering'
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
gr.Markdown(header)
gr.Markdown(abstract)
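    # Chat history shared across turns; passed to and updated by each inference call.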
state = gr.State([])
with gr.Row():
with gr.Column(scale=0.5, min_width=500):
            image_input = gr.Image(type="pil", interactive=True, label="Upload an image 📁").style(height=250)
with gr.Column(scale=0.5, min_width=500):
            chat_input = gr.Textbox(label="Type your text prompt ⬇️")
task_button = gr.Radio(label="Contextual Task type", interactive=True,
choices=['Cloze Test', 'Captioning', 'Question Answering'],
value='Cloze Test')
with gr.Row():
        submit_button = gr.Button(value="🏃 Run", interactive=True, variant="primary")
        clear_button = gr.Button(value="🔄 Clear", interactive=True)
with gr.Row():
with gr.Column(scale=0.5, min_width=500):
image_output = gr.Image(type='pil', interactive=False, label="Detection output")
with gr.Column(scale=0.5, min_width=500):
chat_output = gr.Chatbot(label="Text output").style(height=300)
with gr.Row():
with gr.Column(scale=0.33, min_width=330):
cloze_examples = gr.Dataset(
label='Contextual Cloze Test Examples',
components=[image_input, chat_input],
samples=cloze_samples,
)
with gr.Column(scale=0.33, min_width=330):
qa_examples = gr.Dataset(
label='Contextual Question Answering Examples',
components=[image_input, chat_input],
samples=qa_samples,
)
with gr.Column(scale=0.33, min_width=330):
captioning_examples = gr.Dataset(
label='Contextual Captioning Examples',
components=[image_input, ],
samples=captioning_samples,
)
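    # Run the selected contextual task on submit; update the detection image, the chatbot, and the state.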
submit_button.click(
inference_fn_select,
[image_input, chat_input, task_button, state],
[image_output, chat_output, state],
)
    # Reset every input and output; restore the task selector to its initial default.
    clear_button.click(
        lambda: (None, None, "", [], [], 'Cloze Test'),
        [],
        [image_input, image_output, chat_input, chat_output, state, task_button],
        queue=False,
    )
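    # A new input image invalidates the previous detection result and chat history.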
image_input.change(
lambda: (None, "", []),
[],
[image_output, chat_output, state],
queue=False,
)
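    # Clicking an example fills the inputs and switches the task selector to match.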
cloze_examples.click(
fn=set_cloze_samples,
inputs=[cloze_examples],
outputs=[image_input, chat_input, task_button],
)
captioning_examples.click(
fn=set_captioning_samples,
inputs=[captioning_examples],
outputs=[image_input, chat_input, task_button],
)
qa_examples.click(
fn=set_qa_samples,
inputs=[qa_examples],
outputs=[image_input, chat_input, task_button],
)
gr.Markdown(footer)
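
# Queueing lets long-running inference requests complete without hitting request timeouts;
# set share=True (commented variant below) for a temporary public link when running locally.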
demo.launch(enable_queue=True, share=False)
# demo.launch(enable_queue=True, share=True)