justinj92 committed on
Commit 51a75e5 · verified · 1 Parent(s): c6a0eef

Update app.py

Files changed (1): app.py (+170, -170)
app.py CHANGED
@@ -1,171 +1,171 @@
- from typing import Tuple, Optional
-
- import gradio as gr
- import spaces
- import supervision as sv
- import torch
- from PIL import Image
- from gradio_image_prompter import ImagePrompter
-
- from utils.annotate import annotate_with_boxes
- from utils.models import load_models, run_inference, CHECKPOINTS, \
-     pre_process_region_task_input, post_process_region_output
- from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
-     CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
-     MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
-     IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
-     TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
-     IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
-     DENSE_REGION_CAPTION_TASK_NAME
-
- MARKDOWN = """
- # Florence-2 🔥
-
- Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
- MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
- across tasks such as captioning, object detection, grounding, and segmentation.
- The model takes images and task prompts as input, generating the desired results in
- text format. It uses a DaViT vision encoder to convert images into visual token
- embeddings. These are then concatenated with BERT-generated text embeddings and
- processed by a transformer-based multi-modal encoder-decoder to generate the response.
- """
- EXAMPLES = [
-     ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
- ]
-
- # DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- DEVICE = "cuda"
- MODELS, PROCESSORS = load_models(DEVICE)
-
-
- @spaces.GPU
- def process(
-     checkpoint_dropdown,
-     task_dropdown,
-     image_input,
-     image_prompter_input
- ) -> Tuple[Optional[Image.Image], Optional[str]]:
-     model = MODELS[checkpoint_dropdown]
-     processor = PROCESSORS[checkpoint_dropdown]
-     task = TASKS[task_dropdown]
-
-     if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
-         _, response = run_inference(
-             model, processor, DEVICE, image_input, task)
-         detections = sv.Detections.from_lmm(
-             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
-         return annotate_with_boxes(image_input, detections), None
-
-     elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
-         _, response = run_inference(
-             model, processor, DEVICE, image_input, task)
-         return None, response[task]
-
-     elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
-         detections_list = []
-
-         print(image_prompter_input)
-
-         image_input = image_prompter_input["image"]
-         for prompt in image_prompter_input["points"]:
-             text = pre_process_region_task_input(
-                 prompt=prompt,
-                 resolution_wh=image_input.size
-             )
-             _, response = run_inference(
-                 model, processor, DEVICE, image_input, task, text)
-             detections = sv.Detections.from_lmm(
-                 lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
-             detections_list.append(detections)
-         detections = sv.Detections.merge(detections_list=detections_list)
-         detections = post_process_region_output(
-             detections=detections, resolution_wh=image_input.size)
-
-         return annotate_with_boxes(image_input, detections), None
-
-
- with gr.Blocks() as demo:
-     gr.Markdown(MARKDOWN)
-     with gr.Row():
-         checkpoint_dropdown_component = gr.Dropdown(
-             choices=CHECKPOINTS,
-             value=CHECKPOINTS[0],
-             label="Model", info="Select a Florence 2 model to use.",
-             interactive=True
-         )
-         task_dropdown_component = gr.Dropdown(
-             choices=TASK_NAMES,
-             value=TASK_NAMES[0],
-             label="Task", info="Select a task to perform with the model.",
-             interactive=True
-         )
-
-     with gr.Row():
-         with gr.Column():
-             image_input_component = gr.Image(
-                 type='pil', label='Upload image')
-             image_prompter_input_component = ImagePrompter(
-                 type='pil', label='Image prompt', visible=False)
-             submit_button_component = gr.Button(value='Submit', variant='primary')
-
-         with gr.Column():
-             image_output_component = gr.Image(type='pil', label='Image Output')
-             text_output_component = gr.Textbox(label='Caption Output', visible=False)
-     with gr.Row():
-         gr.Examples(
-             fn=process,
-             examples=EXAMPLES,
-             inputs=[
-                 checkpoint_dropdown_component,
-                 task_dropdown_component,
-                 image_input_component,
-                 image_prompter_input_component
-             ],
-             outputs=[
-                 image_output_component,
-                 text_output_component
-             ],
-             run_on_click=True
-         )
-
-     def on_dropdown_change(text):
-         return [
-             gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
-             ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
-             gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
-             gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
-         ]
-
-     task_dropdown_component.change(
-         on_dropdown_change,
-         inputs=[task_dropdown_component],
-         outputs=[
-             image_input_component,
-             image_prompter_input_component,
-             image_output_component,
-             text_output_component
-         ]
-     )
-     submit_button_component.click(
-         fn=process,
-         inputs=[
-             checkpoint_dropdown_component,
-             task_dropdown_component,
-             image_input_component,
-             image_prompter_input_component
-         ],
-         outputs=[
-             image_output_component,
-             text_output_component
-         ]
-     )
-
+ from typing import Tuple, Optional
+
+ import gradio as gr
+ import spaces
+ import supervision as sv
+ import torch
+ from PIL import Image
+ from gradio_image_prompter import ImagePrompter
+
+ from utils.annotate import annotate_with_boxes
+ from utils.models import load_models, run_inference, CHECKPOINTS, \
+     pre_process_region_task_input, post_process_region_output
+ from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
+     CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
+     MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
+     IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
+     TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
+     IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
+     DENSE_REGION_CAPTION_TASK_NAME
+
+ MARKDOWN = """
+ # Florence-2 🔥
+
+ Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
+ MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
+ across tasks such as captioning, object detection, grounding, and segmentation.
+ The model takes images and task prompts as input, generating the desired results in
+ text format. It uses a DaViT vision encoder to convert images into visual token
+ embeddings. These are then concatenated with BERT-generated text embeddings and
+ processed by a transformer-based multi-modal encoder-decoder to generate the response.
+ """
+ EXAMPLES = [
+     ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
+     ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+ ]
+
+ # DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ DEVICE = "cuda"
+ MODELS, PROCESSORS = load_models(DEVICE)
+
+
+ @spaces.GPU
+ def process(
+     checkpoint_dropdown,
+     task_dropdown,
+     image_input,
+     image_prompter_input
+ ) -> Tuple[Optional[Image.Image], Optional[str]]:
+     model = MODELS[checkpoint_dropdown]
+     processor = PROCESSORS[checkpoint_dropdown]
+     task = TASKS[task_dropdown]
+
+     if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
+         _, response = run_inference(
+             model, processor, DEVICE, image_input, task)
+         detections = sv.Detections.from_lmm(
+             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+         return annotate_with_boxes(image_input, detections), None
+
+     elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
+         _, response = run_inference(
+             model, processor, DEVICE, image_input, task)
+         return None, response[task]
+
+     elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
+         detections_list = []
+
+         print(image_prompter_input)
+
+         image_input = image_prompter_input["image"]
+         for prompt in image_prompter_input["points"]:
+             text = pre_process_region_task_input(
+                 prompt=prompt,
+                 resolution_wh=image_input.size
+             )
+             _, response = run_inference(
+                 model, processor, DEVICE, image_input, task, text)
+             detections = sv.Detections.from_lmm(
+                 lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+             detections_list.append(detections)
+         detections = sv.Detections.merge(detections_list=detections_list)
+         detections = post_process_region_output(
+             detections=detections, resolution_wh=image_input.size)
+
+         return annotate_with_boxes(image_input, detections), None
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(MARKDOWN)
+     with gr.Row():
+         checkpoint_dropdown_component = gr.Dropdown(
+             choices=CHECKPOINTS,
+             value=CHECKPOINTS[0],
+             label="Model", info="Select a Florence 2 model to use.",
+             interactive=True
+         )
+         task_dropdown_component = gr.Dropdown(
+             choices=TASK_NAMES,
+             value=TASK_NAMES[0],
+             label="Task", info="Select a task to perform with the model.",
+             interactive=True
+         )
+
+     with gr.Row():
+         with gr.Column():
+             image_input_component = gr.Image(
+                 type='pil', label='Upload image')
+             image_prompter_input_component = ImagePrompter(
+                 type='pil', label='Image prompt', visible=False)
+             submit_button_component = gr.Button(value='Submit', variant='primary')
+
+         with gr.Column():
+             image_output_component = gr.Image(type='pil', label='Image Output')
+             text_output_component = gr.Textbox(label='Caption Output', visible=False)
+     with gr.Row():
+         gr.Examples(
+             fn=process,
+             examples=EXAMPLES,
+             inputs=[
+                 checkpoint_dropdown_component,
+                 task_dropdown_component,
+                 image_input_component,
+                 image_prompter_input_component
+             ],
+             outputs=[
+                 image_output_component,
+                 text_output_component
+             ],
+             run_on_click=True
+         )
+
+     def on_dropdown_change(text):
+         return [
+             gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
+             ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
+             gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
+             gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
+         ]
+
+     task_dropdown_component.change(
+         on_dropdown_change,
+         inputs=[task_dropdown_component],
+         outputs=[
+             image_input_component,
+             image_prompter_input_component,
+             image_output_component,
+             text_output_component
+         ]
+     )
+     submit_button_component.click(
+         fn=process,
+         inputs=[
+             checkpoint_dropdown_component,
+             task_dropdown_component,
+             image_input_component,
+             image_prompter_input_component
+         ],
+         outputs=[
+             image_output_component,
+             text_output_component
+         ]
+     )
+
  demo.launch(debug=False, show_error=True, max_threads=1)
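The visible content change in this commit is the first EXAMPLES entry: the object-detection example now points at the Roboflow dog-2.jpeg image instead of the Temple Bar photo. For context, below is a minimal, self-contained sketch of the kind of Florence-2 call the Space's run_inference helper presumably wraps. It follows the standard transformers usage from the public microsoft/Florence-2-large-ft model card; the exact signatures of the utils/ helpers are not shown in this diff and may differ.

# Hedged sketch, not part of this commit: standard Florence-2 inference on the
# new example image, assuming the usual transformers remote-code API.
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
import supervision as sv

CHECKPOINT = "microsoft/Florence-2-large-ft"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT, trust_remote_code=True).to(DEVICE)
processor = AutoProcessor.from_pretrained(CHECKPOINT, trust_remote_code=True)

# The object-detection example image introduced by this commit.
url = "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

task = "<OD>"  # object-detection task token
inputs = processor(text=task, images=image, return_tensors="pt").to(DEVICE)
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    num_beams=3
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
response = processor.post_process_generation(
    generated_text, task=task, image_size=(image.width, image.height))
print(response)  # e.g. {'<OD>': {'bboxes': [...], 'labels': [...]}}

# As in the app, the parsed response can be converted into supervision
# Detections for box annotation.
detections = sv.Detections.from_lmm(
    lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image.size)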