gokaygokay committed on
Commit d502400
Parent: b9bfc9d

Update app.py

Files changed (1)
  1. app.py +38 -25
app.py CHANGED
@@ -16,9 +16,19 @@ import numpy as np
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-model_id = 'microsoft/Florence-2-large'
-model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to("cuda").eval()
-processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+models = {
+    'microsoft/Florence-2-large-ft': AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large-ft', trust_remote_code=True).to("cuda").eval(),
+    'microsoft/Florence-2-large': AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True).to("cuda").eval(),
+    'microsoft/Florence-2-base-ft': AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base-ft', trust_remote_code=True).to("cuda").eval(),
+    'microsoft/Florence-2-base': AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True).to("cuda").eval(),
+}
+
+processors = {
+    'microsoft/Florence-2-large-ft': AutoProcessor.from_pretrained('microsoft/Florence-2-large-ft', trust_remote_code=True),
+    'microsoft/Florence-2-large': AutoProcessor.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True),
+    'microsoft/Florence-2-base-ft': AutoProcessor.from_pretrained('microsoft/Florence-2-base-ft', trust_remote_code=True),
+    'microsoft/Florence-2-base': AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True),
+}
 
 
 DESCRIPTION = "# [Florence-2 Demo](https://huggingface.co/microsoft/Florence-2-large)"
@@ -32,8 +42,10 @@ def fig_to_pil(fig):
     buf.seek(0)
     return Image.open(buf)
 
-@spaces.GPU
-def run_example(task_prompt, image, text_input=None):
+@spaces.GPU
+def run_example(task_prompt, image, text_input=None, model_id='microsoft/Florence-2-large'):
+    model = models[model_id]
+    processor = processors[model_id]
     if text_input is None:
         prompt = task_prompt
     else:
@@ -109,73 +121,73 @@ def draw_ocr_bboxes(image, prediction):
                   fill=color)
     return image
 
-def process_image(image, task_prompt, text_input=None):
+def process_image(image, task_prompt, text_input=None, model_id='microsoft/Florence-2-large'):
     image = Image.fromarray(image)  # Convert NumPy array to PIL Image
     if task_prompt == 'Caption':
         task_prompt = '<CAPTION>'
-        result = run_example(task_prompt, image)
-        return result, None
+        results = run_example(task_prompt, image, model_id=model_id)
+        return results, None
     elif task_prompt == 'Detailed Caption':
         task_prompt = '<DETAILED_CAPTION>'
-        result = run_example(task_prompt, image)
-        return result, None
+        results = run_example(task_prompt, image, model_id=model_id)
+        return results, None
     elif task_prompt == 'More Detailed Caption':
         task_prompt = '<MORE_DETAILED_CAPTION>'
-        result = run_example(task_prompt, image)
-        return result, None
+        results = run_example(task_prompt, image, model_id=model_id)
+        return results, None
     elif task_prompt == 'Object Detection':
         task_prompt = '<OD>'
-        results = run_example(task_prompt, image)
+        results = run_example(task_prompt, image, model_id=model_id)
         fig = plot_bbox(image, results['<OD>'])
         return results, fig_to_pil(fig)
     elif task_prompt == 'Dense Region Caption':
         task_prompt = '<DENSE_REGION_CAPTION>'
-        results = run_example(task_prompt, image)
+        results = run_example(task_prompt, image, model_id=model_id)
        fig = plot_bbox(image, results['<DENSE_REGION_CAPTION>'])
         return results, fig_to_pil(fig)
     elif task_prompt == 'Region Proposal':
         task_prompt = '<REGION_PROPOSAL>'
-        results = run_example(task_prompt, image)
+        results = run_example(task_prompt, image, model_id=model_id)
         fig = plot_bbox(image, results['<REGION_PROPOSAL>'])
         return results, fig_to_pil(fig)
     elif task_prompt == 'Caption to Phrase Grounding':
         task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
-        results = run_example(task_prompt, image, text_input)
+        results = run_example(task_prompt, image, text_input, model_id)
         fig = plot_bbox(image, results['<CAPTION_TO_PHRASE_GROUNDING>'])
         return results, fig_to_pil(fig)
     elif task_prompt == 'Referring Expression Segmentation':
         task_prompt = '<REFERRING_EXPRESSION_SEGMENTATION>'
-        results = run_example(task_prompt, image, text_input)
+        results = run_example(task_prompt, image, text_input, model_id)
         output_image = copy.deepcopy(image)
         output_image = draw_polygons(output_image, results['<REFERRING_EXPRESSION_SEGMENTATION>'], fill_mask=True)
         return results, output_image
     elif task_prompt == 'Region to Segmentation':
         task_prompt = '<REGION_TO_SEGMENTATION>'
-        results = run_example(task_prompt, image, text_input)
+        results = run_example(task_prompt, image, text_input, model_id)
         output_image = copy.deepcopy(image)
         output_image = draw_polygons(output_image, results['<REGION_TO_SEGMENTATION>'], fill_mask=True)
         return results, output_image
     elif task_prompt == 'Open Vocabulary Detection':
         task_prompt = '<OPEN_VOCABULARY_DETECTION>'
-        results = run_example(task_prompt, image, text_input)
+        results = run_example(task_prompt, image, text_input, model_id)
         bbox_results = convert_to_od_format(results['<OPEN_VOCABULARY_DETECTION>'])
         fig = plot_bbox(image, bbox_results)
         return results, fig_to_pil(fig)
     elif task_prompt == 'Region to Category':
         task_prompt = '<REGION_TO_CATEGORY>'
-        results = run_example(task_prompt, image, text_input)
+        results = run_example(task_prompt, image, text_input, model_id)
         return results, None
     elif task_prompt == 'Region to Description':
         task_prompt = '<REGION_TO_DESCRIPTION>'
-        results = run_example(task_prompt, image, text_input)
+        results = run_example(task_prompt, image, text_input, model_id)
         return results, None
     elif task_prompt == 'OCR':
         task_prompt = '<OCR>'
-        result = run_example(task_prompt, image)
-        return result, None
+        results = run_example(task_prompt, image, model_id=model_id)
+        return results, None
     elif task_prompt == 'OCR with Region':
         task_prompt = '<OCR_WITH_REGION>'
-        results = run_example(task_prompt, image)
+        results = run_example(task_prompt, image, model_id=model_id)
         output_image = copy.deepcopy(image)
         output_image = draw_ocr_bboxes(output_image, results['<OCR_WITH_REGION>'])
         return results, output_image
@@ -196,6 +208,7 @@ with gr.Blocks(css=css) as demo:
         with gr.Row():
             with gr.Column():
                 input_img = gr.Image(label="Input Picture")
+                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value='microsoft/Florence-2-large')
                 task_prompt = gr.Dropdown(choices=[
                     'Caption', 'Detailed Caption', 'More Detailed Caption', 'Object Detection',
                     'Dense Region Caption', 'Region Proposal', 'Caption to Phrase Grounding',
@@ -221,6 +234,6 @@ with gr.Blocks(css=css) as demo:
             label='Try examples'
         )
 
-        submit_btn.click(process_image, [input_img, task_prompt, text_input], [output_text, output_img])
+        submit_btn.click(process_image, [input_img, task_prompt, text_input, model_selector], [output_text, output_img])
 
 demo.launch(debug=True)
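
As a quick sanity check of the new multi-model plumbing, here is a minimal sketch of the call the updated click handler now makes. It assumes it is run inside app.py after process_image is defined (before demo.launch); the file name sample.jpg and the chosen checkpoint are illustrative placeholders, not part of this commit.

# Hypothetical smoke test for the model_id plumbing added in this commit.
# Assumes process_image from app.py is in scope; 'sample.jpg' is a placeholder input.
import numpy as np
from PIL import Image

sample = np.array(Image.open("sample.jpg").convert("RGB"))

# Mirrors the Gradio wiring: the fourth positional argument is the model_selector value.
results, annotated = process_image(sample, "Object Detection", None, "microsoft/Florence-2-base-ft")
print(results)
if annotated is not None:
    annotated.save("sample_od.png")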