gokaygokay committed
Commit 6172e67
Parent: 062585d

Update app.py

Files changed (1)
1. app.py (+125 / -96)
app.py CHANGED
@@ -12,7 +12,6 @@ import matplotlib.patches as patches
 
  import random
  import numpy as np
- import cv2
 
  import subprocess
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
@@ -22,7 +21,7 @@ model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).t
  processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 
 
- DESCRIPTION = "# [Florence-2 Video Demo](https://huggingface.co/microsoft/Florence-2-large)"
+ DESCRIPTION = "# [Florence-2 Demo](https://huggingface.co/microsoft/Florence-2-large)"
 
  colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
              'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
@@ -68,6 +67,7 @@ def plot_bbox(image, data):
      return fig
 
  def draw_polygons(image, prediction, fill_mask=False):
+
      draw = ImageDraw.Draw(image)
      scale = 1
      for polygons, label in zip(prediction['polygons'], prediction['labels']):
@@ -86,6 +86,15 @@ def draw_polygons(image, prediction, fill_mask=False):
          draw.text((_polygon[0] + 8, _polygon[1] + 2), label, fill=color)
      return image
 
+ def convert_to_od_format(data):
+     bboxes = data.get('bboxes', [])
+     labels = data.get('bboxes_labels', [])
+     od_results = {
+         'bboxes': bboxes,
+         'labels': labels
+     }
+     return od_results
+
  def draw_ocr_bboxes(image, prediction):
      scale = 1
      draw = ImageDraw.Draw(image)
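
Side note on the new helper (not part of the diff itself): convert_to_od_format only renames keys, so that the post-processed <OPEN_VOCABULARY_DETECTION> output, which uses 'bboxes'/'bboxes_labels', can be passed to plot_bbox, which expects the <OD>-style 'bboxes'/'labels' keys. A minimal sketch with made-up coordinates:

    ovd_result = {
        'bboxes': [[34.2, 16.0, 512.5, 355.8]],   # hypothetical box from Open Vocabulary Detection
        'bboxes_labels': ['a green car'],
    }
    convert_to_od_format(ovd_result)
    # -> {'bboxes': [[34.2, 16.0, 512.5, 355.8]], 'labels': ['a green car']}
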
@@ -100,98 +109,118 @@ def draw_ocr_bboxes(image, prediction):
                    fill=color)
      return image
 
- def process_video(video_path, task_prompt, text_input=None):
-     video = cv2.VideoCapture(video_path)
-     fps = video.get(cv2.CAP_PROP_FPS)
-     width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
-     height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
-
-     output_frames = []
-
-     while True:
-         ret, frame = video.read()
-         if not ret:
-             break
-
-         image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-
-         if task_prompt == 'Caption':
-             task_prompt = '<CAPTION>'
-             result = run_example(task_prompt, image)
-             output_frames.append(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
-         elif task_prompt == 'Detailed Caption':
-             task_prompt = '<DETAILED_CAPTION>'
-             result = run_example(task_prompt, image)
-             output_frames.append(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
-         elif task_prompt == 'More Detailed Caption':
-             task_prompt = '<MORE_DETAILED_CAPTION>'
-             result = run_example(task_prompt, image)
-             output_frames.append(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
-         elif task_prompt == 'Object Detection':
-             task_prompt = '<OD>'
-             results = run_example(task_prompt, image)
-             fig = plot_bbox(image, results['<OD>'])
-             output_frames.append(cv2.cvtColor(np.array(fig_to_pil(fig)), cv2.COLOR_RGB2BGR))
-         elif task_prompt == 'Referring Expression Segmentation':
-             task_prompt = '<REF_SEG>'
-             results = run_example(task_prompt, image, text_input)
-             annotated_image = draw_polygons(image.copy(), results['<REF_SEG>'])
-             output_frames.append(cv2.cvtColor(np.array(annotated_image), cv2.COLOR_RGB2BGR))
-         elif task_prompt == 'OCR':
-             task_prompt = '<OCR>'
-             results = run_example(task_prompt, image)
-             annotated_image = draw_ocr_bboxes(image.copy(), results['<OCR>'])
-             output_frames.append(cv2.cvtColor(np.array(annotated_image), cv2.COLOR_RGB2BGR))
-         else:
-             raise ValueError(f"Unsupported task prompt: {task_prompt}")
-
-     video.release()
-
-     output_path = 'output_video.mp4'
-     out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
-     for frame in output_frames:
-         out.write(frame)
-     out.release()
-
-     return output_path
-
- task_prompts = ['Caption', 'Detailed Caption', 'More Detailed Caption', 'Object Detection', 'Referring Expression Segmentation', 'OCR']
-
- with gr.Blocks(css="style.css") as demo:
-     with gr.Group():
-         with gr.Row():
-             video_input = gr.Video(
-                 label='Input Video',
-                 format='mp4',
-                 source='upload',
-                 interactive=True
-             )
+ def process_image(image, task_prompt, text_input=None):
+     image = Image.fromarray(image)  # Convert NumPy array to PIL Image
+     if task_prompt == 'Caption':
+         task_prompt = '<CAPTION>'
+         result = run_example(task_prompt, image)
+         return result, None
+     elif task_prompt == 'Detailed Caption':
+         task_prompt = '<DETAILED_CAPTION>'
+         result = run_example(task_prompt, image)
+         return result, None
+     elif task_prompt == 'More Detailed Caption':
+         task_prompt = '<MORE_DETAILED_CAPTION>'
+         result = run_example(task_prompt, image)
+         return result, None
+     elif task_prompt == 'Object Detection':
+         task_prompt = '<OD>'
+         results = run_example(task_prompt, image)
+         fig = plot_bbox(image, results['<OD>'])
+         return results, fig_to_pil(fig)
+     elif task_prompt == 'Dense Region Caption':
+         task_prompt = '<DENSE_REGION_CAPTION>'
+         results = run_example(task_prompt, image)
+         fig = plot_bbox(image, results['<DENSE_REGION_CAPTION>'])
+         return results, fig_to_pil(fig)
+     elif task_prompt == 'Region Proposal':
+         task_prompt = '<REGION_PROPOSAL>'
+         results = run_example(task_prompt, image)
+         fig = plot_bbox(image, results['<REGION_PROPOSAL>'])
+         return results, fig_to_pil(fig)
+     elif task_prompt == 'Caption to Phrase Grounding':
+         task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
+         results = run_example(task_prompt, image, text_input)
+         fig = plot_bbox(image, results['<CAPTION_TO_PHRASE_GROUNDING>'])
+         return results, fig_to_pil(fig)
+     elif task_prompt == 'Referring Expression Segmentation':
+         task_prompt = '<REFERRING_EXPRESSION_SEGMENTATION>'
+         results = run_example(task_prompt, image, text_input)
+         output_image = copy.deepcopy(image)
+         output_image = draw_polygons(output_image, results['<REFERRING_EXPRESSION_SEGMENTATION>'], fill_mask=True)
+         return results, output_image
+     elif task_prompt == 'Region to Segmentation':
+         task_prompt = '<REGION_TO_SEGMENTATION>'
+         results = run_example(task_prompt, image, text_input)
+         output_image = copy.deepcopy(image)
+         output_image = draw_polygons(output_image, results['<REGION_TO_SEGMENTATION>'], fill_mask=True)
+         return results, output_image
+     elif task_prompt == 'Open Vocabulary Detection':
+         task_prompt = '<OPEN_VOCABULARY_DETECTION>'
+         results = run_example(task_prompt, image, text_input)
+         bbox_results = convert_to_od_format(results['<OPEN_VOCABULARY_DETECTION>'])
+         fig = plot_bbox(image, bbox_results)
+         return results, fig_to_pil(fig)
+     elif task_prompt == 'Region to Category':
+         task_prompt = '<REGION_TO_CATEGORY>'
+         results = run_example(task_prompt, image, text_input)
+         return results, None
+     elif task_prompt == 'Region to Description':
+         task_prompt = '<REGION_TO_DESCRIPTION>'
+         results = run_example(task_prompt, image, text_input)
+         return results, None
+     elif task_prompt == 'OCR':
+         task_prompt = '<OCR>'
+         result = run_example(task_prompt, image)
+         return result, None
+     elif task_prompt == 'OCR with Region':
+         task_prompt = '<OCR_WITH_REGION>'
+         results = run_example(task_prompt, image)
+         output_image = copy.deepcopy(image)
+         output_image = draw_ocr_bboxes(output_image, results['<OCR_WITH_REGION>'])
+         return results, output_image
+     else:
+         return "", None  # Return empty string and None for unknown task prompts
+
+ css = """
+ #output {
+     height: 500px;
+     overflow: auto;
+     border: 1px solid #ccc;
+ }
+ """
+
+ with gr.Blocks(css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+     with gr.Tab(label="Florence-2 Image Captioning"):
          with gr.Row():
-             select_task = gr.Dropdown(
-                 label='Task Prompt',
-                 choices=task_prompts,
-                 value=task_prompts[0],
-                 interactive=True
-             )
-             text_input = gr.Textbox(
-                 label='Text Input (optional)',
-                 visible=False
-             )
-             submit = gr.Button(
-                 label='Process Video',
-                 scale=1,
-                 variant='primary'
-             )
-     video_output = gr.Video(
-         label='Florence-2 Video Demo',
-         format='mp4',
-         interactive=False
-     )
-
-     submit.click(
-         fn=process_video,
-         inputs=[video_input, select_task, text_input],
-         outputs=video_output,
-     )
-
- demo.queue().launch()
+             with gr.Column():
+                 input_img = gr.Image(label="Input Picture")
+                 task_prompt = gr.Dropdown(choices=[
+                     'Caption', 'Detailed Caption', 'More Detailed Caption', 'Object Detection',
+                     'Dense Region Caption', 'Region Proposal', 'Caption to Phrase Grounding',
+                     'Referring Expression Segmentation', 'Region to Segmentation',
+                     'Open Vocabulary Detection', 'Region to Category', 'Region to Description',
+                     'OCR', 'OCR with Region'
+                 ], label="Task Prompt", value='Caption')
+                 text_input = gr.Textbox(label="Text Input (optional)")
+                 submit_btn = gr.Button(value="Submit")
+             with gr.Column():
+                 output_text = gr.Textbox(label="Output Text")
+                 output_img = gr.Image(label="Output Image")
+
+         gr.Examples(
+             examples=[
+                 ["image1.jpg", 'Object Detection'],
+                 ["image2.jpg", 'OCR with Region']
+             ],
+             inputs=[input_img, task_prompt],
+             outputs=[output_text, output_img],
+             fn=process_image,
+             cache_examples=True,
+             label='Try examples'
+         )
+
+         submit_btn.click(process_image, [input_img, task_prompt, text_input], [output_text, output_img])
+
+ demo.launch(debug=True)
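
For context, process_image relies on the run_example helper defined earlier in app.py, which this commit does not touch. A minimal sketch of what that helper usually looks like in the Florence-2 demos (following the model card; the actual code in this Space may differ):

    def run_example(task_prompt, image, text_input=None):
        # Florence-2 prompts are the task tag, optionally followed by free text (e.g. a phrase to ground)
        prompt = task_prompt if text_input is None else task_prompt + text_input
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            do_sample=False,
            num_beams=3,
        )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        # Convert the raw generation into the structured dict used above, e.g. {'<OD>': {...}}
        return processor.post_process_generation(
            generated_text, task=task_prompt, image_size=(image.width, image.height)
        )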
 
 
 
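Outside of the Gradio UI, the new process_image function can also be called directly: it takes a NumPy array (the default gr.Image type) plus a task name, and returns a pair of (text or parsed dict, optional annotated PIL image). An illustrative call, reusing the image1.jpg file referenced in the Examples block; the printed output shape is assumed from the Florence-2 <OD> format:

    import numpy as np
    from PIL import Image

    frame = np.array(Image.open("image1.jpg").convert("RGB"))
    results, annotated = process_image(frame, 'Object Detection')

    print(results)                      # e.g. {'<OD>': {'bboxes': [...], 'labels': [...]}}
    if annotated is not None:
        annotated.save("image1_boxes.png")  # bounding boxes rendered by plot_bbox/fig_to_pil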