gokaygokay committed
Commit 062585d
1 Parent(s): 3780820

Update app.py

Files changed (1)
  1. app.py +96 -125
app.py CHANGED
@@ -12,6 +12,7 @@ import matplotlib.patches as patches
 
 import random
 import numpy as np
+import cv2
 
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
@@ -21,7 +22,7 @@ model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).t
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 
 
-DESCRIPTION = "# [Florence-2 Demo](https://huggingface.co/microsoft/Florence-2-large)"
+DESCRIPTION = "# [Florence-2 Video Demo](https://huggingface.co/microsoft/Florence-2-large)"
 
 colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
             'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
@@ -67,7 +68,6 @@ def plot_bbox(image, data):
     return fig
 
 def draw_polygons(image, prediction, fill_mask=False):
-
     draw = ImageDraw.Draw(image)
     scale = 1
     for polygons, label in zip(prediction['polygons'], prediction['labels']):
@@ -86,15 +86,6 @@ def draw_polygons(image, prediction, fill_mask=False):
             draw.text((_polygon[0] + 8, _polygon[1] + 2), label, fill=color)
     return image
 
-def convert_to_od_format(data):
-    bboxes = data.get('bboxes', [])
-    labels = data.get('bboxes_labels', [])
-    od_results = {
-        'bboxes': bboxes,
-        'labels': labels
-    }
-    return od_results
-
 def draw_ocr_bboxes(image, prediction):
     scale = 1
     draw = ImageDraw.Draw(image)
@@ -109,118 +100,98 @@ def draw_ocr_bboxes(image, prediction):
                   fill=color)
     return image
 
-def process_image(image, task_prompt, text_input=None):
-    image = Image.fromarray(image) # Convert NumPy array to PIL Image
-    if task_prompt == 'Caption':
-        task_prompt = '<CAPTION>'
-        result = run_example(task_prompt, image)
-        return result, None
-    elif task_prompt == 'Detailed Caption':
-        task_prompt = '<DETAILED_CAPTION>'
-        result = run_example(task_prompt, image)
-        return result, None
-    elif task_prompt == 'More Detailed Caption':
-        task_prompt = '<MORE_DETAILED_CAPTION>'
-        result = run_example(task_prompt, image)
-        return result, None
-    elif task_prompt == 'Object Detection':
-        task_prompt = '<OD>'
-        results = run_example(task_prompt, image)
-        fig = plot_bbox(image, results['<OD>'])
-        return results, fig_to_pil(fig)
-    elif task_prompt == 'Dense Region Caption':
-        task_prompt = '<DENSE_REGION_CAPTION>'
-        results = run_example(task_prompt, image)
-        fig = plot_bbox(image, results['<DENSE_REGION_CAPTION>'])
-        return results, fig_to_pil(fig)
-    elif task_prompt == 'Region Proposal':
-        task_prompt = '<REGION_PROPOSAL>'
-        results = run_example(task_prompt, image)
-        fig = plot_bbox(image, results['<REGION_PROPOSAL>'])
-        return results, fig_to_pil(fig)
-    elif task_prompt == 'Caption to Phrase Grounding':
-        task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
-        results = run_example(task_prompt, image, text_input)
-        fig = plot_bbox(image, results['<CAPTION_TO_PHRASE_GROUNDING>'])
-        return results, fig_to_pil(fig)
-    elif task_prompt == 'Referring Expression Segmentation':
-        task_prompt = '<REFERRING_EXPRESSION_SEGMENTATION>'
-        results = run_example(task_prompt, image, text_input)
-        output_image = copy.deepcopy(image)
-        output_image = draw_polygons(output_image, results['<REFERRING_EXPRESSION_SEGMENTATION>'], fill_mask=True)
-        return results, output_image
-    elif task_prompt == 'Region to Segmentation':
-        task_prompt = '<REGION_TO_SEGMENTATION>'
-        results = run_example(task_prompt, image, text_input)
-        output_image = copy.deepcopy(image)
-        output_image = draw_polygons(output_image, results['<REGION_TO_SEGMENTATION>'], fill_mask=True)
-        return results, output_image
-    elif task_prompt == 'Open Vocabulary Detection':
-        task_prompt = '<OPEN_VOCABULARY_DETECTION>'
-        results = run_example(task_prompt, image, text_input)
-        bbox_results = convert_to_od_format(results['<OPEN_VOCABULARY_DETECTION>'])
-        fig = plot_bbox(image, bbox_results)
-        return results, fig_to_pil(fig)
-    elif task_prompt == 'Region to Category':
-        task_prompt = '<REGION_TO_CATEGORY>'
-        results = run_example(task_prompt, image, text_input)
-        return results, None
-    elif task_prompt == 'Region to Description':
-        task_prompt = '<REGION_TO_DESCRIPTION>'
-        results = run_example(task_prompt, image, text_input)
-        return results, None
-    elif task_prompt == 'OCR':
-        task_prompt = '<OCR>'
-        result = run_example(task_prompt, image)
-        return result, None
-    elif task_prompt == 'OCR with Region':
-        task_prompt = '<OCR_WITH_REGION>'
-        results = run_example(task_prompt, image)
-        output_image = copy.deepcopy(image)
-        output_image = draw_ocr_bboxes(output_image, results['<OCR_WITH_REGION>'])
-        return results, output_image
-    else:
-        return "", None # Return empty string and None for unknown task prompts
-
-css = """
-  #output {
-    height: 500px;
-    overflow: auto;
-    border: 1px solid #ccc;
-  }
-"""
-
-with gr.Blocks(css=css) as demo:
-    gr.Markdown(DESCRIPTION)
-    with gr.Tab(label="Florence-2 Image Captioning"):
+def process_video(video_path, task_prompt, text_input=None):
+    video = cv2.VideoCapture(video_path)
+    fps = video.get(cv2.CAP_PROP_FPS)
+    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+    output_frames = []
+
+    while True:
+        ret, frame = video.read()
+        if not ret:
+            break
+
+        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+        if task_prompt == 'Caption':
+            task_prompt = '<CAPTION>'
+            result = run_example(task_prompt, image)
+            output_frames.append(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
+        elif task_prompt == 'Detailed Caption':
+            task_prompt = '<DETAILED_CAPTION>'
+            result = run_example(task_prompt, image)
+            output_frames.append(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
+        elif task_prompt == 'More Detailed Caption':
+            task_prompt = '<MORE_DETAILED_CAPTION>'
+            result = run_example(task_prompt, image)
+            output_frames.append(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
+        elif task_prompt == 'Object Detection':
+            task_prompt = '<OD>'
+            results = run_example(task_prompt, image)
+            fig = plot_bbox(image, results['<OD>'])
+            output_frames.append(cv2.cvtColor(np.array(fig_to_pil(fig)), cv2.COLOR_RGB2BGR))
+        elif task_prompt == 'Referring Expression Segmentation':
+            task_prompt = '<REF_SEG>'
+            results = run_example(task_prompt, image, text_input)
+            annotated_image = draw_polygons(image.copy(), results['<REF_SEG>'])
+            output_frames.append(cv2.cvtColor(np.array(annotated_image), cv2.COLOR_RGB2BGR))
+        elif task_prompt == 'OCR':
+            task_prompt = '<OCR>'
+            results = run_example(task_prompt, image)
+            annotated_image = draw_ocr_bboxes(image.copy(), results['<OCR>'])
+            output_frames.append(cv2.cvtColor(np.array(annotated_image), cv2.COLOR_RGB2BGR))
+        else:
+            raise ValueError(f"Unsupported task prompt: {task_prompt}")
+
+    video.release()
+
+    output_path = 'output_video.mp4'
+    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
+    for frame in output_frames:
+        out.write(frame)
+    out.release()
+
+    return output_path
+
+task_prompts = ['Caption', 'Detailed Caption', 'More Detailed Caption', 'Object Detection', 'Referring Expression Segmentation', 'OCR']
+
+with gr.Blocks(css="style.css") as demo:
+    with gr.Group():
+        with gr.Row():
+            video_input = gr.Video(
+                label='Input Video',
+                format='mp4',
+                source='upload',
+                interactive=True
+            )
         with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(label="Input Picture")
-                task_prompt = gr.Dropdown(choices=[
-                    'Caption', 'Detailed Caption', 'More Detailed Caption', 'Object Detection',
-                    'Dense Region Caption', 'Region Proposal', 'Caption to Phrase Grounding',
-                    'Referring Expression Segmentation', 'Region to Segmentation',
-                    'Open Vocabulary Detection', 'Region to Category', 'Region to Description',
-                    'OCR', 'OCR with Region'
-                ], label="Task Prompt", value= 'Caption')
-                text_input = gr.Textbox(label="Text Input (optional)")
-                submit_btn = gr.Button(value="Submit")
-            with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-                output_img = gr.Image(label="Output Image")
-
-        gr.Examples(
-            examples=[
-                ["image1.jpg", 'Object Detection'],
-                ["image2.jpg", 'OCR with Region']
-            ],
-            inputs=[input_img, task_prompt],
-            outputs=[output_text, output_img],
-            fn=process_image,
-            cache_examples=True,
-            label='Try examples'
-        )
-
-        submit_btn.click(process_image, [input_img, task_prompt, text_input], [output_text, output_img])
-
-demo.launch(debug=True)
+            select_task = gr.Dropdown(
+                label='Task Prompt',
+                choices=task_prompts,
+                value=task_prompts[0],
+                interactive=True
+            )
+            text_input = gr.Textbox(
+                label='Text Input (optional)',
+                visible=False
+            )
+        submit = gr.Button(
+            label='Process Video',
+            scale=1,
+            variant='primary'
+        )
+    video_output = gr.Video(
+        label='Florence-2 Video Demo',
+        format='mp4',
+        interactive=False
+    )
+
+    submit.click(
+        fn=process_video,
+        inputs=[video_input, select_task, text_input],
+        outputs=video_output,
+    )
+
+demo.queue().launch()
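
Note: the per-frame loop in the new process_video calls run_example(task_prompt, image), a helper defined earlier in app.py and therefore not visible in this diff. For context, the following is a minimal sketch of what such a helper typically looks like for Florence-2, following the model card's documented usage; the generation settings and the use of model.device are assumptions here, not taken from this commit.

# Hypothetical sketch of the run_example helper referenced above;
# the real implementation lives higher up in app.py and may differ.
def run_example(task_prompt, image, text_input=None):
    prompt = task_prompt if text_input is None else task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    # Florence-2's processor parses the raw text into a task-keyed dict,
    # e.g. {'<OD>': {'bboxes': [...], 'labels': [...]}}, which the drawing helpers consume.
    return processor.post_process_generation(
        generated_text, task=task_prompt, image_size=(image.width, image.height)
    )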
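
For a quick check outside the Gradio UI, the new entry point can also be exercised directly; "sample.mp4" below is a placeholder path, not a file shipped with the Space.

# Hypothetical smoke test for the new video pipeline (not part of the commit).
annotated_path = process_video("sample.mp4", "Object Detection")
print(annotated_path)  # path of the annotated clip, e.g. 'output_video.mp4'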