maxiw committed on
Commit
3ba2ba9
1 Parent(s): 0da7bd3

add example

Files changed (2)
  1. app.py +17 -5
  2. assets/image2.jpg +0 -0
app.py CHANGED
@@ -22,8 +22,8 @@ DESCRIPTION = "# Qwen2-VL Object Localization Demo"
 
 def image_to_base64(image):
     buffered = BytesIO()
-    image.save(buffered, format="PNG")  # Save the image in memory as PNG
-    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")  # Encode image to base64
+    image.save(buffered, format="PNG")
+    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
     return img_str
 
 
@@ -101,21 +101,33 @@ css = """
     border: 1px solid #ccc;
 }
 """
+default_system_prompt = "You are a helpful assistant for detecting objects in images. When asked to detect elements based on a description, you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax], with the values scaled to 1000 by 1000 pixels. When there is more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
 
 with gr.Blocks(css=css) as demo:
     gr.Markdown(DESCRIPTION)
     with gr.Tab(label="Qwen2-VL Input"):
         with gr.Row():
             with gr.Column():
-                input_img = gr.Image(label="Input Picture", type="pil")
+                input_img = gr.Image(label="Input Image", type="pil")
                 model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
-                system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful assistant for detecting objects in images. When asked to detect elements based on a description, you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax], with the values scaled to 1000 by 1000 pixels. When there is more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...].")
+                system_prompt = gr.Textbox(label="System Prompt", value=default_system_prompt)
                 text_input = gr.Textbox(label="Description of Localization Target")
                 submit_btn = gr.Button(value="Submit")
             with gr.Column():
                 model_output_text = gr.Textbox(label="Model Output Text")
                 parsed_boxes = gr.Textbox(label="Parsed Boxes")
-                annotated_image = gr.Image(label="Annotated Picture")
+                annotated_image = gr.Image(label="Annotated Image")
+
+        gr.Examples(
+            examples=[
+                ["assets/image2.jpg", "orange button", default_system_prompt],
+            ],
+            inputs=[input_img, text_input, system_prompt],
+            outputs=[model_output_text, parsed_boxes, annotated_image],
+            fn=run_example,
+            cache_examples=True,
+            label="Try examples"
+        )
 
         submit_btn.click(run_example, [input_img, text_input, system_prompt, model_selector], [model_output_text, parsed_boxes, annotated_image])
 
assets/image2.jpg ADDED
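A note on the gr.Examples block: with cache_examples=True, Gradio runs run_example on the listed example once at startup and stores the outputs, so visitors clicking the example see the cached annotated image rather than triggering a fresh model call.

The system prompt asks the model for coordinates on a fixed 1000 by 1000 grid, so the boxes must be rescaled to the real image dimensions before drawing. A minimal sketch of that post-processing, assuming the model's text output matches the prompted format; parse_boxes and draw_boxes here are illustrative helpers, not the app's actual run_example:

import ast
import re

from PIL import Image, ImageDraw

def parse_boxes(text):
    # Pull every [xmin, ymin, xmax, ymax] group out of the raw model text;
    # works for a single box or a list of boxes.
    groups = re.findall(r"\[\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\]", text)
    return [ast.literal_eval(g) for g in groups]

def draw_boxes(image, boxes, grid=1000):
    # Rescale from the prompt's 1000x1000 grid to actual pixel coordinates.
    sx, sy = image.width / grid, image.height / grid
    draw = ImageDraw.Draw(image)
    for xmin, ymin, xmax, ymax in boxes:
        draw.rectangle([xmin * sx, ymin * sy, xmax * sx, ymax * sy], outline="red", width=3)
    return image

model_output = "[[104, 612, 180, 680]]"  # hypothetical model answer
annotated = draw_boxes(Image.open("assets/image2.jpg"), parse_boxes(model_output))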