Spaces:
Sleeping
Sleeping
File size: 19,670 Bytes
a5ea546 711211a 395d6df 711211a 1fb7158 711211a e1c03cb 711211a f80b905 e1c03cb f80b905 e1c03cb f80b905 e1c03cb 711211a 66ba241 e1c03cb 395d6df 711211a 5f0290e 711211a 66ba241 711211a 395d6df 711211a 92ef913 711211a 92ef913 711211a 92ef913 711211a 395d6df 711211a a209c09 711211a a209c09 711211a 92ef913 711211a 92ef913 711211a a209c09 711211a 66ba241 711211a d323ecd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 |
import os
import io
import torch
import json
import base64
import gradio as gr
import numpy as np
from pathlib import Path
from PIL import Image
from plots import get_pre_define_colors
from utils.load_model import load_xclip
from utils.predict import xclip_pred
#! Huggingface does not allow load model to main process, so we need to load the model when needed, it may not help in improve the speed of the app.
try:
import spaces
XCLIP, OWLVIT_PRECESSOR = None, None
DEVICE = 'cuda'
except:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Not at Huggingface demo, load model to main process.")
XCLIP, OWLVIT_PRECESSOR = load_xclip(DEVICE)
print(f"Device: {DEVICE}")
XCLIP_DESC_PATH = "data/jsons/bs_cub_desc.json"
XCLIP_DESC = json.load(open(XCLIP_DESC_PATH, "r"))
IMAGES_FOLDER = "data/images"
# XCLIP_RESULTS = json.load(open("data/jsons/xclip_org.json", "r"))
IMAGE2GT = json.load(open("data/jsons/image2gt.json", 'r'))
CUB_DESC_EMBEDS = torch.load('data/text_embeddings/cub_200_desc.pt')
CUB_IDX2NAME = json.load(open('data/jsons/cub_desc_idx2name.json', 'r'))
CUB_IDX2NAME = {int(k): v for k, v in CUB_IDX2NAME.items()}
IMAGE_FILE_LIST = json.load(open("data/jsons/file_list.json", "r"))
IMAGE_GALLERY = [Image.open(os.path.join(IMAGES_FOLDER, 'org', file_name)).convert('RGB') for file_name in IMAGE_FILE_LIST]
ORG_PART_ORDER = ['back', 'beak', 'belly', 'breast', 'crown', 'forehead', 'eyes', 'legs', 'wings', 'nape', 'tail', 'throat']
ORDERED_PARTS = ['crown', 'forehead', 'nape', 'eyes', 'beak', 'throat', 'breast', 'belly', 'back', 'wings', 'legs', 'tail']
COLORS = get_pre_define_colors(12, cmap_set=['Set2', 'tab10'])
SACHIT_COLOR = "#ADD8E6"
# CUB_BOXES = json.load(open("data/jsons/cub_boxes_owlvit_large.json", "r"))
VISIBILITY_DICT = json.load(open("data/jsons/cub_vis_dict_binary.json", 'r'))
VISIBILITY_DICT['Eastern_Bluebird.jpg'] = dict(zip(ORDERED_PARTS, [True]*12))
# --- Image related functions ---
def img_to_base64(img):
img_pil = Image.fromarray(img) if isinstance(img, np.ndarray) else img
buffered = io.BytesIO()
img_pil.save(buffered, format="JPEG")
img_str = base64.b64encode(buffered.getvalue())
return img_str.decode()
def create_blank_image(width=500, height=500, color=(255, 255, 255)):
"""Create a blank image of the given size and color."""
return np.array(Image.new("RGB", (width, height), color))
# Convert RGB colors to hex
def rgb_to_hex(rgb):
return f"#{''.join(f'{x:02x}' for x in rgb)}"
def load_part_images(file_name: str) -> dict:
part_images = {}
# start_time = time.time()
for part_name in ORDERED_PARTS:
base_name = Path(file_name).stem
part_image_path = os.path.join(IMAGES_FOLDER, "boxes", f"{base_name}_{part_name}.jpg")
if not Path(part_image_path).exists():
continue
image = np.array(Image.open(part_image_path))
part_images[part_name] = img_to_base64(image)
# print(f"Time cost to load 12 images: {time.time() - start_time}")
# This takes less than 0.01 seconds. So the loading time is not the bottleneck.
return part_images
def generate_xclip_explanations(result_dict:dict, visibility: dict, part_mask: dict = dict(zip(ORDERED_PARTS, [1]*12))):
"""
The result_dict needs three keys: 'descriptions', 'pred_scores', 'file_name'
descriptions: {part_name1: desc_1, part_name2: desc_2, ...}
pred_scores: {part_name1: score_1, part_name2: score_2, ...}
file_name: str
"""
descriptions = result_dict['descriptions']
image_name = result_dict['file_name']
part_images = PART_IMAGES_DICT[image_name]
MAX_LENGTH = 50
exp_length = 400
fontsize = 15
# Start the SVG inside a div
svg_parts = [f'<div style="width: {exp_length}px; height: 450px; background-color: white;">',
"<svg width=\"100%\" height=\"100%\">"]
# Add a row for each visible bird part
y_offset = 0
for part in ORDERED_PARTS:
if visibility[part] and part_mask[part]:
# Calculate the length of the bar (scaled to fit within the SVG)
part_score = max(result_dict['pred_scores'][part], 0)
bar_length = part_score * exp_length
# Modify the overlay image's opacity on mouseover and mouseout
mouseover_action1 = f"document.getElementById('overlayImage').src = 'data:image/jpeg;base64,{part_images[part]}'; document.getElementById('overlayImage').style.opacity = 1;"
mouseout_action1 = "document.getElementById('overlayImage').style.opacity = 0;"
combined_mouseover = f"javascript: {mouseover_action1};"
combined_mouseout = f"javascript: {mouseout_action1};"
# Add the description
num_lines = len(descriptions[part]) // MAX_LENGTH + 1
for line in range(num_lines):
desc_line = descriptions[part][line*MAX_LENGTH:(line+1)*MAX_LENGTH]
y_offset += fontsize
svg_parts.append(f"""
<text x="0" y="{y_offset}" font-size="{fontsize}"
onmouseover="{combined_mouseover}"
onmouseout="{combined_mouseout}">
{desc_line}
</text>
""")
# Add the bars
svg_parts.append(f"""
<rect x="0" y="{y_offset +3}" width="{bar_length}" height="{fontsize*0.7}" fill="{PART_COLORS[part]}"
onmouseover="{combined_mouseover}"
onmouseout="{combined_mouseout}">
</rect>
""")
# Add the scores
svg_parts.append(f'<text x="{exp_length - 50}" y="{y_offset+fontsize+3}" font-size="{fontsize}" fill="{PART_COLORS[part]}">{part_score:.2f}</text>')
y_offset += fontsize + 3
svg_parts.extend(("</svg>", "</div>"))
# Join everything into a single string
html = "".join(svg_parts)
return html
def generate_sachit_explanations(result_dict:dict):
descriptions = result_dict['descriptions']
scores = result_dict['scores']
MAX_LENGTH = 50
exp_length = 400
fontsize = 15
descriptions = zip(scores, descriptions)
descriptions = sorted(descriptions, key=lambda x: x[0], reverse=True)
# Start the SVG inside a div
svg_parts = [f'<div style="width: {exp_length}px; height: 450px; background-color: white;">',
"<svg width=\"100%\" height=\"100%\">"]
# Add a row for each visible bird part
y_offset = 0
for score, desc in descriptions:
# Calculate the length of the bar (scaled to fit within the SVG)
part_score = max(score, 0)
bar_length = part_score * exp_length
# Split the description into two lines if it's too long
num_lines = len(desc) // MAX_LENGTH + 1
for line in range(num_lines):
desc_line = desc[line*MAX_LENGTH:(line+1)*MAX_LENGTH]
y_offset += fontsize
svg_parts.append(f"""
<text x="0" y="{y_offset}" font-size="{fontsize}" fill="black">
{desc_line}
</text>
""")
# Add the bar
svg_parts.append(f"""
<rect x="0" y="{y_offset+3}" width="{bar_length}" height="{fontsize*0.7}" fill="{SACHIT_COLOR}">
</rect>
""")
# Add the score
svg_parts.append(f'<text x="{exp_length - 50}" y="{y_offset+fontsize+3}" font-size="fontsize" fill="{SACHIT_COLOR}">{part_score:.2f}</text>') # Added fill color
y_offset += fontsize + 3
svg_parts.extend(("</svg>", "</div>"))
# Join everything into a single string
html = "".join(svg_parts)
return html
# --- Constants created by the functions above ---
BLANK_OVERLAY = img_to_base64(create_blank_image())
PART_COLORS = {part: rgb_to_hex(COLORS[i]) for i, part in enumerate(ORDERED_PARTS)}
blank_image = np.array(Image.open('data/images/final.png').convert('RGB'))
PART_IMAGES_DICT = {file_name: load_part_images(file_name) for file_name in IMAGE_FILE_LIST}
# --- Gradio Functions ---
def update_selected_image(event: gr.SelectData):
image_height = 400
index = event.index
image_name = IMAGE_FILE_LIST[index]
current_image.state = image_name
org_image = Image.open(os.path.join(IMAGES_FOLDER, 'org', image_name)).convert('RGB')
img_base64 = f"""
<div style="position: relative; height: {image_height}px; display: inline-block;">
<img id="birdImage" src="data:image/jpeg;base64,{img_to_base64(org_image)}" style="height: {image_height}px; width: auto;">
<img id="overlayImage" src="data:image/jpeg;base64,{BLANK_OVERLAY}" style="position:absolute; top:0; left:0; width:auto; height: {image_height}px; opacity: 0;">
</div>
"""
gt_label = IMAGE2GT[image_name]
gt_class.state = gt_label
# --- for initial value only ---
out_dict = xclip_pred(new_desc=None,
new_part_mask=None,
new_class=None,
org_desc=XCLIP_DESC_PATH,
image=Image.open(os.path.join(IMAGES_FOLDER, 'org', current_image.state)).convert('RGB'),
model=XCLIP,
owlvit_processor=OWLVIT_PRECESSOR,
device=DEVICE,
image_name=current_image.state,
cub_embeds=CUB_DESC_EMBEDS,
cub_idx2name=CUB_IDX2NAME,
descriptors=XCLIP_DESC)
xclip_label = out_dict['pred_class']
clip_pred_scores = out_dict['pred_score']
xclip_part_scores = out_dict['pred_desc_scores']
result_dict = {'descriptions': dict(zip(ORG_PART_ORDER, out_dict["descriptions"])), 'pred_scores': xclip_part_scores, 'file_name': current_image.state}
xclip_exp = generate_xclip_explanations(result_dict, VISIBILITY_DICT[current_image.state], part_mask=dict(zip(ORDERED_PARTS, [1]*12)))
# --- end of intial value ---
xclip_color = "green" if xclip_label.strip() == gt_label.strip() else "red"
xclip_pred_markdown = f"""
### <span style='color:{xclip_color}'>XCLIP: {xclip_label} {clip_pred_scores:.4f}</span>
"""
gt_label = f"""
## {gt_label}
"""
current_predicted_class.state = xclip_label
# Populate the textbox with current descriptions
custom_class_name = "class name: custom"
descs = XCLIP_DESC[xclip_label]
descs = {k: descs[i] for i, k in enumerate(ORG_PART_ORDER)}
descs = {k: descs[k] for k in ORDERED_PARTS}
custom_text = [custom_class_name] + list(descs.values())
descriptions = ";\n".join(custom_text)
# textbox = gr.Textbox.update(value=descriptions, lines=12, visible=True, label="XCLIP descriptions", interactive=True, info='Please use ";" to separate the descriptions for each part, and keep the format of {part name}: {descriptions}', show_label=False)
textbox = gr.Textbox(value=descriptions,
lines=12,
visible=True,
label="XCLIP descriptions",
interactive=True,
info='Please use ";" to separate the descriptions for each part, and keep the format of {part name}: {descriptions}',
show_label=False)
# modified_exp = gr.HTML().update(value="", visible=True)
return gt_label, img_base64, xclip_pred_markdown, xclip_exp, current_image, textbox
def on_edit_button_click_xclip():
# empty_exp = gr.HTML.update(visible=False)
empty_exp = gr.HTML(visible=False)
# Populate the textbox with current descriptions
descs = XCLIP_DESC[current_predicted_class.state]
descs = {k: descs[i] for i, k in enumerate(ORG_PART_ORDER)}
descs = {k: descs[k] for k in ORDERED_PARTS}
custom_text = ["class name: custom"] + list(descs.values())
descriptions = ";\n".join(custom_text)
# textbox = gr.Textbox.update(value=descriptions, lines=12, visible=True, label="XCLIP descriptions", interactive=True, info='Please use ";" to separate the descriptions for each part, and keep the format of {part name}: {descriptions}', show_label=False)
textbox = gr.Textbox(value=descriptions,
lines=12,
visible=True,
label="XCLIP descriptions",
interactive=True,
info='Please use ";" to separate the descriptions for each part, and keep the format of {part name}: {descriptions}',
show_label=False)
return textbox, empty_exp
def convert_input_text_to_xclip_format(textbox_input: str):
# Split the descriptions by newline to get individual descriptions for each part
descriptions_list = textbox_input.split(";\n")
# the first line should be "class name: xxx"
class_name_line = descriptions_list[0]
new_class_name = class_name_line.split(":")[1].strip()
descriptions_list = descriptions_list[1:]
# construct descripion dict with part name as key
descriptions_dict = {}
for desc in descriptions_list:
if desc.strip() == "":
continue
part_name, _ = desc.split(":")
descriptions_dict[part_name.strip()] = desc
# fill with empty string if the part is not in the descriptions
part_mask = {}
for part in ORDERED_PARTS:
if part not in descriptions_dict:
descriptions_dict[part] = ""
part_mask[part] = 0
else:
part_mask[part] = 1
return descriptions_dict, part_mask, new_class_name
def on_predict_button_click_xclip(textbox_input: str):
descriptions_dict, part_mask, new_class_name = convert_input_text_to_xclip_format(textbox_input)
# Get the new predictions and explanations
out_dict = xclip_pred(new_desc=descriptions_dict,
new_part_mask=part_mask,
new_class=new_class_name,
org_desc=XCLIP_DESC_PATH,
image=Image.open(os.path.join(IMAGES_FOLDER, 'org', current_image.state)).convert('RGB'),
model=XCLIP,
owlvit_processor=OWLVIT_PRECESSOR,
device=DEVICE,
image_name=current_image.state,
cub_embeds=CUB_DESC_EMBEDS,
cub_idx2name=CUB_IDX2NAME,
descriptors=XCLIP_DESC)
xclip_label = out_dict['pred_class']
xclip_pred_score = out_dict['pred_score']
xclip_part_scores = out_dict['pred_desc_scores']
custom_label = out_dict['modified_class']
custom_pred_score = out_dict['modified_score']
custom_part_scores = out_dict['modified_desc_scores']
# construct a result dict to generate xclip explanations
result_dict = {'descriptions': dict(zip(ORG_PART_ORDER, out_dict["descriptions"])), 'pred_scores': xclip_part_scores, 'file_name': current_image.state}
xclip_explanation = generate_xclip_explanations(result_dict, VISIBILITY_DICT[current_image.state], part_mask)
modified_result_dict = {'descriptions': dict(zip(ORG_PART_ORDER, out_dict["modified_descriptions"])), 'pred_scores': custom_part_scores, 'file_name': current_image.state}
modified_explanation = generate_xclip_explanations(modified_result_dict, VISIBILITY_DICT[current_image.state], part_mask)
xclip_color = "green" if xclip_label.strip() == gt_class.state.strip() else "red"
xclip_pred_markdown = f"""
### <span style='color:{xclip_color}'> {xclip_label} {xclip_pred_score:.4f}</span>
"""
custom_color = "green" if custom_label.strip() == gt_class.state.strip() else "red"
custom_pred_markdown = f"""
### <span style='color:{custom_color}'> {custom_label} {custom_pred_score:.4f}</span>
"""
# textbox = gr.Textbox.update(visible=False)
textbox = gr.Textbox(visible=False)
# return textbox, xclip_pred_markdown, xclip_explanation, custom_pred_markdown, modified_explanation
# modified_exp = gr.HTML().update(value=modified_explanation, visible=True)
modified_exp = gr.HTML(value=modified_explanation, visible=True)
return textbox, xclip_pred_markdown, xclip_explanation, custom_pred_markdown, modified_exp
custom_css = """
html, body {
margin: 0;
padding: 0;
}
#container {
position: relative;
width: 400px;
height: 400px;
border: 1px solid #000;
margin: 0 auto; /* This will center the container horizontally */
}
#canvas {
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
object-fit: cover;
}
"""
# Define the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="PEEB") as demo:
current_image = gr.State("")
current_predicted_class = gr.State("")
gt_class = gr.State("")
with gr.Column():
title_text = gr.Markdown("# PEEB - demo")
gr.Markdown(
"""
- In this demo a demo for PEEB paper (NAACL finding 2024).
- paper: https://arxiv.org/abs/2403.05297
- code: https://github.com/anguyen8/peeb/tree/inspect_ddp
"""
)
# display the gallery of images
with gr.Column():
gr.Markdown("## Select an image to start!")
image_gallery = gr.Gallery(value=IMAGE_GALLERY, label=None, preview=False, allow_preview=False, columns=10, height=250)
gr.Markdown("### Custom descritions: \n The first row should be **class name: {some name};**, where you can name your descriptions. \n For the remianing descriptions, please use **;** to separate the descriptions for each part, and use the format **{part name}: {descriptions}**. \n Note that you can delete a part completely, in such cases, all descriptions will remove the corresponding part.")
with gr.Row():
with gr.Column():
image_label = gr.Markdown("### Class Name")
org_image = gr.HTML()
with gr.Column():
with gr.Row():
# xclip_predict_button = gr.Button(label="Predict", value="Predict")
xclip_predict_button = gr.Button(value="Predict")
xclip_pred_label = gr.Markdown("### PEEB:")
xclip_explanation = gr.HTML()
with gr.Column():
# xclip_edit_button = gr.Button(label="Edit", value="Reset Descriptions")
xclip_edit_button = gr.Button(value="Reset Descriptions")
custom_pred_label = gr.Markdown(
"### Custom Descritpions:"
)
xclip_textbox = gr.Textbox(lines=12, placeholder="Edit the descriptions here", visible=False)
# ai_explanation = gr.Image(type="numpy", visible=True, show_label=False, height=500)
custom_explanation = gr.HTML()
gr.HTML("<br>")
image_gallery.select(update_selected_image, inputs=None, outputs=[image_label, org_image, xclip_pred_label, xclip_explanation, current_image, xclip_textbox])
xclip_edit_button.click(on_edit_button_click_xclip, inputs=[], outputs=[xclip_textbox, custom_explanation])
xclip_predict_button.click(on_predict_button_click_xclip, inputs=[xclip_textbox], outputs=[xclip_textbox, xclip_pred_label, xclip_explanation, custom_pred_label, custom_explanation])
demo.launch() |