maxiw committed
Commit 161f68c
1 Parent(s): 4f3b26b

initial commit

Files changed (2)
  1. app.py +76 -4
  2. requirements.txt +8 -0
app.py CHANGED
@@ -1,7 +1,79 @@
  import gradio as gr
- 
- def greet(name):
-     return "Hello " + name + "!!"
- 
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import spaces
+ import torch
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ from PIL import Image
+ 
+ # Qwen2-VL is not registered with AutoModelForCausalLM; it ships its own
+ # conditional-generation class.
+ models = {
+     "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained(
+         "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+     )
+ }
+ 
+ processors = {
+     "Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+ }
+ 
+ DESCRIPTION = "# Qwen2-VL Object Localization Demo"
+ 
+ 
+ @spaces.GPU
+ def run_example(image, text_input, model_id="Qwen/Qwen2-VL-7B-Instruct"):
+     model = models[model_id].eval().cuda()
+     processor = processors[model_id]
+ 
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": f"Give a bounding box for {text_input}"},
+             ],
+         }
+     ]
+ 
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+     inputs = inputs.to("cuda")
+ 
+     generated_ids = model.generate(**inputs, max_new_tokens=128)
+     # Drop the prompt tokens so only the newly generated answer is decoded.
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     output_text = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )
+     return output_text
+ 
+ css = """
+ #output {
+     height: 500px;
+     overflow: auto;
+     border: 1px solid #ccc;
+ }
+ """
+ 
+ with gr.Blocks(css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+     with gr.Tab(label="Qwen2-VL Input"):
+         with gr.Row():
+             with gr.Column():
+                 # type="pil" hands the processor a PIL image instead of a numpy array
+                 input_img = gr.Image(label="Input Picture", type="pil")
+                 model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
+                 text_input = gr.Textbox(label="Description of Localization Target")
+                 submit_btn = gr.Button(value="Submit")
+             with gr.Column():
+                 output_text = gr.Textbox(label="Output Text")
+ 
+     submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])
+ 
+ demo.launch(debug=True)
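
Note: the demo returns the model's raw text output only. As a hedged sketch (not part of this commit), Qwen2-VL grounding answers typically encode a box as "(x1,y1),(x2,y2)" with coordinates on a 0-1000 normalized grid, so a small hypothetical helper like draw_boxes below could overlay the prediction on the input image; verify the exact output format against the model card for your checkpoint.

import re
from PIL import Image, ImageDraw

def draw_boxes(image: Image.Image, output_text: str) -> Image.Image:
    # Hypothetical helper, not in the commit. Assumes boxes appear as
    # "(x1,y1),(x2,y2)" with coordinates on a 0-1000 grid (Qwen2-VL's
    # usual grounding convention).
    boxes = re.findall(r"\((\d+),\s*(\d+)\),\s*\((\d+),\s*(\d+)\)", output_text)
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    for coords in boxes:
        x1, y1, x2, y2 = (int(c) for c in coords)
        # Rescale from the normalized grid to pixel coordinates.
        x1, x2 = x1 * image.width // 1000, x2 * image.width // 1000
        y1, y2 = y1 * image.height // 1000, y2 * image.height // 1000
        draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
    return annotated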
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ numpy==1.24.4
+ Pillow==10.3.0
+ Requests==2.31.0
+ torch
+ torchvision
+ transformers==4.45.0  # Qwen2-VL support landed in 4.45; 4.43.0 predates it
+ accelerate==0.30.0
+ qwen-vl-utils
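
For reference, a hedged sketch of calling the running app programmatically with gradio_client (not pinned above; pip install gradio_client). The endpoint name "/run_example" is an assumption based on Gradio's default of deriving it from the event function's name; check client.view_api() if it differs.

from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7860/")  # or the Space URL

result = client.predict(
    handle_file("example.jpg"),     # input image (hypothetical local file)
    "the red car",                  # localization target (example prompt)
    "Qwen/Qwen2-VL-7B-Instruct",    # model dropdown value
    api_name="/run_example",        # assumption: Gradio's auto-derived name
)
print(result)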