lang-ground / app_langloc.py
jing-bi's picture
Upload folder using huggingface_hub
917b125 verified
import gradio as gr
from langground import LangGround, text_palette
state = {"loc_model": None, "llm_model": None, "model": None}
def load_model(loc_model: str, llm_model: str) -> LangGround:
if (loc_model, llm_model) != (state["loc_model"], state["llm_model"]):
gr.Info("Loading models...", duration=5)
state.update({"model": LangGround(loc_model=loc_model, llm_model=llm_model), "loc_model": loc_model, "llm_model": llm_model})
gr.Info("Models loaded!", duration=2.5)
return state["model"]
def predict(frame, question: str, threshold: float, loc_model: str, llm_model: str):
if not frame or not question.strip():
gr.Warning("Please provide both an image and a question")
return "", None, None
model = load_model(loc_model, llm_model)
return model.localize(frame, question, threshold=threshold)
title = """
<center>
<h1> πŸ” Language Localization </h1>
<b> Upload an image and ask questions to find objects in it. <b>
</center>
"""
with gr.Blocks() as demo:
gr.HTML(title)
with gr.Row():
with gr.Column(scale=1):
frame_input = gr.Image(type="pil", label="Upload Frame")
with gr.Column(scale=1):
with gr.Row():
with gr.Column(scale=1):
loc_model_input = gr.Dropdown(
choices=["yolo", "owl"],
value="yolo",
label="Localization Model",
)
with gr.Column(scale=2):
llm_model_input = gr.Dropdown(
choices=[
"Qwen/Qwen2.5-7B-Instruct",
"OpenGVLab/InternVL2_5-8B",
"OpenGVLab/InternVL2_5-4B",
"OpenGVLab/InternVL2_5-2B",
"OpenGVLab/InternVL2_5-1B",
],
value="Qwen/Qwen2.5-7B-Instruct",
label="LLM Model",
)
threshold_input = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1, label="Threshold")
question_input = gr.Textbox(lines=2, placeholder="Enter your question here", label="Question")
objs = gr.Highlightedtext(show_legend=False, show_inline_category=False, color_map=text_palette, label="Objects Found")
submit_btn = gr.Button("Submit")
with gr.Row():
all_bbox_image = gr.Image(label="Found Objects")
llm_bbox_image = gr.Image(label="Selected Objects")
submit_btn.click(
fn=predict,
inputs=[frame_input, question_input, threshold_input, loc_model_input, llm_model_input],
outputs=[objs, all_bbox_image, llm_bbox_image],
)
examples = gr.Examples(
examples=[
["assets/demo.jpeg", "I'm thirsty"],
["assets/kitchen.webp", "The food has expired and is no longer safe to eat."],
["assets/kitchen.webp", "The food is about to expire."],
],
inputs=[frame_input, question_input],
)
if __name__ == "__main__":
demo.launch()