OPEN_VOCABULARY_DETECTION returns polygons which cannot be displayed
#16
opened by MoritzLaurer (HF staff)
When I run OPEN_VOCABULARY_DETECTION on some images, it returns polygons instead of bboxes. It seems like the resulting polygons cannot be displayed on the image with the code from the example notebook:
task_prompt = '<OPEN_VOCABULARY_DETECTION>'

# model, processor and image are loaded as in the example notebook
def florence2_inference(task_prompt, text_input=None):
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )
    return parsed_answer

results = florence2_inference(task_prompt, text_input="The face of the character in the middle of the illustration. The expression is neutral, with distinct hair and facial features. The head is tilted slightly to the right.")
print(results)
{'<OPEN_VOCABULARY_DETECTION>': {'bboxes': [], 'bboxes_labels': [], 'polygons': [[[0.49000000953674316, 0.367000013589859, 979.510009765625, 0.367000013589859, 979.510009765625, 1.1010000705718994, 0.49000000953674316, 0.367000013589859]]], 'polygons_labels': ['The star located at the top left corner of the illustration. Small star with a bright yellow color and five points, slightly tilted to the left.']}}
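Treating the coordinates as flat (x, y) pixel pairs, the way the notebook's drawing code does, the returned polygon looks degenerate. A quick sketch to check its extent:

import numpy as np

polygon = results['<OPEN_VOCABULARY_DETECTION>']['polygons'][0][0]
points = np.array(polygon).reshape(-1, 2)  # (x, y) vertex pairs

width = points[:, 0].max() - points[:, 0].min()
height = points[:, 1].max() - points[:, 1].min()
print(f"polygon extent: {width:.2f} x {height:.2f} px")
# -> roughly 979 x 0.7 px, i.e. the polygon spans the image width but is less than a pixel high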
Trying to draw the resulting polygon on an image results in horizontal lines.
from PIL import Image, ImageDraw, ImageFont
import random
import numpy as np

colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
            'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']

def draw_polygons(image, prediction, fill_mask=False):
    """
    Draws segmentation masks with polygons on an image.

    Parameters:
    - image: PIL image to draw on.
    - prediction: Dictionary containing 'polygons' and 'polygons_labels' keys.
      'polygons' is a list of lists, each containing the vertices of a polygon.
      'polygons_labels' is a list of labels corresponding to each polygon.
    - fill_mask: Boolean indicating whether to fill the polygons with color.
    """
    draw = ImageDraw.Draw(image)

    # Set up scale factor if needed (use 1 if not scaling)
    scale = 1

    # Iterate over polygons and labels
    for polygons, label in zip(prediction['polygons'], prediction['polygons_labels']):
        color = random.choice(colormap)
        fill_color = random.choice(colormap) if fill_mask else None

        for _polygon in polygons:
            _polygon = np.array(_polygon).reshape(-1, 2)
            if len(_polygon) < 3:
                print('Invalid polygon:', _polygon)
                continue

            _polygon = (_polygon * scale).reshape(-1).tolist()

            # Draw the polygon
            if fill_mask:
                draw.polygon(_polygon, outline=color, fill=fill_color)
            else:
                draw.polygon(_polygon, outline=color)

            # Draw the label text
            draw.text((_polygon[0] + 8, _polygon[1] + 2), label, fill=color)

    # Save or display the image
    #image.show()  # Display the image
    #display(image)
    return image

image = resize_image_from_url(image_url)
# Draw annotations
annotated_image = draw_polygons(image, results["<OPEN_VOCABULARY_DETECTION>"])
annotated_image.show()
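A possible fallback would be to collapse each polygon into its axis-aligned bounding box and draw that instead. A rough sketch (polygon_to_bbox is my own helper, not from the notebook), although for the polygon above the box would still be less than a pixel high:

from PIL import ImageDraw
import numpy as np

def polygon_to_bbox(polygon):
    """Collapse a flat [x1, y1, x2, y2, ...] polygon into an axis-aligned bbox."""
    points = np.array(polygon).reshape(-1, 2)
    x_min, y_min = points.min(axis=0)
    x_max, y_max = points.max(axis=0)
    return [float(x_min), float(y_min), float(x_max), float(y_max)]

image = resize_image_from_url(image_url)  # start from a clean copy of the image
draw = ImageDraw.Draw(image)
prediction = results["<OPEN_VOCABULARY_DETECTION>"]
for polygons, label in zip(prediction['polygons'], prediction['polygons_labels']):
    for _polygon in polygons:
        x_min, y_min, x_max, y_max = polygon_to_bbox(_polygon)
        draw.rectangle([x_min, y_min, x_max, y_max], outline='red', width=2)
        draw.text((x_min + 8, y_min + 2), label, fill='red')
image.show()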
Maybe my text inputs are too complex, leading to out-of-distribution issues?
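If so, a much shorter query should behave differently; an easy check would be something like:

# hypothetical simpler query to see whether prompt complexity is the problem
results_simple = florence2_inference(task_prompt, text_input="a face")
print(results_simple)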