import os

# Build and install the project in development mode at startup so the demo is
# self-contained on hosted hardware (e.g. a Hugging Face Space); `build develop`
# also compiles any custom extension ops the detector needs.
os.system("python setup.py build develop --user")

import gradio as gr

from app_util import ContextDetDemo
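
# ContextDetDemo (defined in app_util.py) wraps checkpoint loading and exposes
# a forward(image, text, task, history) method; the Gradio UI below is a thin
# layer over that call.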

header = '''
<div align=center>
<h1 id="title" style="font-weight: 900; margin-bottom: 7px;">
Contextual Object Detection with Multimodal Large Language Models
</h1>
</div>
'''

abstract = '''
🤗 This is the official Gradio demo for <b>Contextual Object Detection with Multimodal Large Language Models</b>.

🚀 Our goal is to promote object detection with better `context understanding` and enable `interactive feedback`
through `human language vocabulary`, all made possible by using multimodal large language models!

🤔 This demo is still under construction. Your comments or suggestions are welcome!

⚡ For faster inference without waiting in the queue, you may duplicate this Space and upgrade to a GPU:
<a href="https://huggingface.co/spaces/yuhangzang/ContextDet-Demo?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
<p/>
'''

footer = r'''
📦 **GitHub Repo**

We would be grateful if you consider starring our <a href="https://github.com/yuhangzang/ContextDET">GitHub repo</a>.

📝 **Citation**

Please consider citing our work if you find it useful:
```bibtex
@article{zang2023contextual,
  author  = {Zang, Yuhang and Li, Wei and Han, Jun and Zhou, Kaiyang and Loy, Chen Change},
  title   = {Contextual Object Detection with Multimodal Large Language Models},
  journal = {arXiv preprint arXiv:2305.18279},
  year    = {2023},
}
```

📄 **License**

This project is licensed under
<a rel="license" href="https://github.com/sczhou/CodeFormer/blob/master/LICENSE">S-Lab License 1.0</a>.
Redistribution and use for non-commercial purposes should follow this license.

📧 **Contact**

If you have any questions, please feel free to contact Yuhang Zang <b>(zang0012@ntu.edu.sg)</b>.
'''

css = '''
h1#title {
    text-align: center;
}
'''

cloze_samples = [
    ["main_4.jpg", "A teacher is helping a <mask> with her homework at a desk."],
    ["main_5.jpg", "A man crossing a busy <mask> with his <mask> up."],
]
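
# In the contextual cloze test, each `<mask>` placeholder above marks an object
# word that the model is asked to infer from the sentence context and then
# localize in the image.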

captioning_samples = [
    ["main_1.jpg"],
    ["main_2.jpg"],
    ["main_4.jpg"],
    ["main_6.jpeg"],
]

qa_samples = [
    ["main_5.jpg", "What is his career?"],
    ["main_6.jpeg", "What are they doing?"],
]

contextdet_model = ContextDetDemo('./ckpt.pth')
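
# Loading happens once at import time. The checkpoint is assumed to sit at
# ./ckpt.pth (e.g. committed to the Space or fetched during the build).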


def inference_fn_select(image_input, text_input, task_button, history=None):
    # Use None instead of a mutable default; Gradio supplies the session state.
    history = history if history is not None else []
    return contextdet_model.forward(image_input, text_input, task_button, history)
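
# forward() returns (annotated image, chatbot message history, updated state),
# matching the [image_output, chat_output, state] outputs wired to the Run
# button below.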


def set_cloze_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(example[1]), 'Cloze Test'


def set_captioning_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(''), 'Captioning'


def set_qa_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(example[1]), 'Question Answering'
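
# A clicked gr.Dataset passes the selected sample (a list aligned with its
# `components`) to its handler; the handlers above stage the example in the
# input widgets and switch the task radio to the matching task.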


with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown(header)
    gr.Markdown(abstract)
    state = gr.State([])

    with gr.Row():
        with gr.Column(scale=0.5, min_width=500):
            image_input = gr.Image(type="pil", interactive=True, label="Upload an image 📁").style(height=250)
        with gr.Column(scale=0.5, min_width=500):
            chat_input = gr.Textbox(label="Type your text prompt ⌨️")
            task_button = gr.Radio(label="Contextual Task type", interactive=True,
                                   choices=['Cloze Test', 'Captioning', 'Question Answering'],
                                   value='Cloze Test')
            with gr.Row():
                submit_button = gr.Button(value="🏃 Run", interactive=True, variant="primary")
                clear_button = gr.Button(value="🔄 Clear", interactive=True)

    with gr.Row():
        with gr.Column(scale=0.5, min_width=500):
            image_output = gr.Image(type='pil', interactive=False, label="Detection output")
        with gr.Column(scale=0.5, min_width=500):
            chat_output = gr.Chatbot(label="Text output").style(height=300)

    with gr.Row():
        with gr.Column(scale=0.33, min_width=330):
            cloze_examples = gr.Dataset(
                label='Contextual Cloze Test Examples',
                components=[image_input, chat_input],
                samples=cloze_samples,
            )
        with gr.Column(scale=0.33, min_width=330):
            qa_examples = gr.Dataset(
                label='Contextual Question Answering Examples',
                components=[image_input, chat_input],
                samples=qa_samples,
            )
        with gr.Column(scale=0.33, min_width=330):
            captioning_examples = gr.Dataset(
                label='Contextual Captioning Examples',
                components=[image_input],
                samples=captioning_samples,
            )

    submit_button.click(
        inference_fn_select,
        [image_input, chat_input, task_button, state],
        [image_output, chat_output, state],
    )
    clear_button.click(
        # Reset every widget; the task radio returns to its initial value.
        lambda: (None, None, "", [], [], 'Cloze Test'),
        [],
        [image_input, image_output, chat_input, chat_output, state, task_button],
        queue=False,
    )
    image_input.change(
        # A new upload invalidates any previous detections and chat history.
        lambda: (None, "", []),
        [],
        [image_output, chat_output, state],
        queue=False,
    )
    cloze_examples.click(
        fn=set_cloze_samples,
        inputs=[cloze_examples],
        outputs=[image_input, chat_input, task_button],
    )
    captioning_examples.click(
        fn=set_captioning_samples,
        inputs=[captioning_examples],
        outputs=[image_input, chat_input, task_button],
    )
    qa_examples.click(
        fn=set_qa_samples,
        inputs=[qa_examples],
        outputs=[image_input, chat_input, task_button],
    )

    gr.Markdown(footer)

# enable_queue is the Gradio 3.x launch flag that serializes requests through a
# queue; newer Gradio versions configure this with demo.queue() instead.
demo.launch(enable_queue=True, share=False)