maxiw committed
Commit 161f68c
1 Parent(s): 4f3b26b

initial commit

Files changed (2)
  1. app.py +76 -4
  2. requirements.txt +8 -0
app.py CHANGED
@@ -1,7 +1,79 @@
  import gradio as gr
- 
- def greet(name):
-     return "Hello " + name + "!!"
- 
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import spaces
+ import torch
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ from PIL import Image
+ 
+ # Qwen2-VL is not registered with AutoModelForCausalLM; it ships its own
+ # conditional-generation class.
+ models = {
+     "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained(
+         "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+     )
+ }
+ 
+ processors = {
+     "Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+ }
+ 
+ DESCRIPTION = "# Qwen2-VL Object Localization Demo"
+ 
+ 
+ @spaces.GPU
+ def run_example(image, text_input, model_id="Qwen/Qwen2-VL-7B-Instruct"):
+     model = models[model_id].eval().cuda()
+     processor = processors[model_id]
+ 
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": f"Give a bounding box for {text_input}"},
+             ],
+         }
+     ]
+ 
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+     inputs = inputs.to("cuda")
+ 
+     generated_ids = model.generate(**inputs, max_new_tokens=128)
+     # Drop the prompt tokens so only the newly generated answer is decoded.
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     output_text = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )
+     return output_text
+ 
+ css = """
+ #output {
+     height: 500px;
+     overflow: auto;
+     border: 1px solid #ccc;
+ }
+ """
+ 
+ with gr.Blocks(css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+     with gr.Tab(label="Qwen2-VL Input"):
+         with gr.Row():
+             with gr.Column():
+                 # type="pil" hands the processor a PIL image instead of a numpy array
+                 input_img = gr.Image(label="Input Picture", type="pil")
+                 model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
+                 text_input = gr.Textbox(label="Description of Localization Target")
+                 submit_btn = gr.Button(value="Submit")
+             with gr.Column():
+                 output_text = gr.Textbox(label="Output Text")
+ 
+     submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])
+ 
+ demo.launch(debug=True)
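
Note: the demo returns the model's raw text output only. As a hedged sketch (not part of this commit), Qwen2-VL grounding answers typically encode a box as "(x1,y1),(x2,y2)" with coordinates on a 0-1000 normalized grid, so a small hypothetical helper like draw_boxes below could overlay the prediction on the input image; verify the exact output format against the model card for your checkpoint.

import re
from PIL import Image, ImageDraw

def draw_boxes(image: Image.Image, output_text: str) -> Image.Image:
    # Hypothetical helper, not in the commit. Assumes boxes appear as
    # "(x1,y1),(x2,y2)" with coordinates on a 0-1000 grid (Qwen2-VL's
    # usual grounding convention).
    boxes = re.findall(r"\((\d+),\s*(\d+)\),\s*\((\d+),\s*(\d+)\)", output_text)
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    for coords in boxes:
        x1, y1, x2, y2 = (int(c) for c in coords)
        # Rescale from the normalized grid to pixel coordinates.
        x1, x2 = x1 * image.width // 1000, x2 * image.width // 1000
        y1, y2 = y1 * image.height // 1000, y2 * image.height // 1000
        draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
    return annotated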
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ numpy==1.24.4
+ Pillow==10.3.0
+ Requests==2.31.0
+ torch
+ torchvision
+ transformers==4.45.0  # Qwen2-VL support landed in 4.45; 4.43.0 predates it
+ accelerate==0.30.0
+ qwen-vl-utils
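
For reference, a hedged sketch of calling the running app programmatically with gradio_client (not pinned above; pip install gradio_client). The endpoint name "/run_example" is an assumption based on Gradio's default of deriving it from the event function's name; check client.view_api() if it differs.

from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7860/")  # or the Space URL

result = client.predict(
    handle_file("example.jpg"),     # input image (hypothetical local file)
    "the red car",                  # localization target (example prompt)
    "Qwen/Qwen2-VL-7B-Instruct",    # model dropdown value
    api_name="/run_example",        # assumption: Gradio's auto-derived name
)
print(result)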