Spaces:

XAI
/

PEEB

Running on Zero

File size: 19,670 Bytes

import os
import io

import torch
import json
import base64
import gradio as gr
import numpy as np
from pathlib import Path
from PIL import Image

from plots import get_pre_define_colors
from utils.load_model import load_xclip
from utils.predict import xclip_pred


# Hugging Face Spaces does not allow the model to be loaded in the main
# process, so when the `spaces` package is importable the model globals are
# left as None here.
# NOTE(review): nothing in this file assigns XCLIP / OWLVIT_PRECESSOR later;
# presumably the model is loaded on demand inside `xclip_pred` -- confirm.
try:
    import spaces
except ImportError:
    # `spaces` is not installed: we are not on a Hugging Face Space, so load
    # the model eagerly on whatever device is available. Only ImportError is
    # caught so that unrelated failures (or Ctrl-C) are not silently turned
    # into a local model load.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    print("Not at Huggingface demo, load model to main process.")
    XCLIP, OWLVIT_PRECESSOR = load_xclip(DEVICE)
else:
    XCLIP, OWLVIT_PRECESSOR = None, None
    DEVICE = 'cuda'

print(f"Device: {DEVICE}")

def _load_json(path):
    """Parse the JSON file at ``path``; the file is closed before returning."""
    with open(path, "r") as json_file:
        return json.load(json_file)


# Per-class part descriptions; indexed by class name elsewhere in this file.
XCLIP_DESC_PATH = "data/jsons/bs_cub_desc.json"
XCLIP_DESC = _load_json(XCLIP_DESC_PATH)
IMAGES_FOLDER = "data/images"
# XCLIP_RESULTS = json.load(open("data/jsons/xclip_org.json", "r"))
# Image file name -> ground-truth class label.
IMAGE2GT = _load_json("data/jsons/image2gt.json")
CUB_DESC_EMBEDS = torch.load('data/text_embeddings/cub_200_desc.pt')
# JSON object keys are always strings; convert them back to integer indices.
CUB_IDX2NAME = _load_json('data/jsons/cub_desc_idx2name.json')
CUB_IDX2NAME = {int(k): v for k, v in CUB_IDX2NAME.items()}

# File names of the gallery images, and the decoded images themselves.
IMAGE_FILE_LIST = _load_json("data/jsons/file_list.json")
IMAGE_GALLERY = [Image.open(os.path.join(IMAGES_FOLDER, 'org', file_name)).convert('RGB') for file_name in IMAGE_FILE_LIST]

# ORG_PART_ORDER is the order in which per-part description lists are indexed
# in this file; ORDERED_PARTS is the order used for display and editing.
ORG_PART_ORDER = ['back', 'beak', 'belly', 'breast', 'crown', 'forehead', 'eyes', 'legs', 'wings', 'nape', 'tail', 'throat']
ORDERED_PARTS = ['crown', 'forehead', 'nape', 'eyes', 'beak', 'throat', 'breast', 'belly', 'back', 'wings', 'legs', 'tail']
COLORS = get_pre_define_colors(12, cmap_set=['Set2', 'tab10'])
SACHIT_COLOR = "#ADD8E6"
# CUB_BOXES = json.load(open("data/jsons/cub_boxes_owlvit_large.json", "r"))
# Image file name -> {part name: visibility flag}.
VISIBILITY_DICT = _load_json("data/jsons/cub_vis_dict_binary.json")
# Mark every part as visible for this one image (added or overridden here).
VISIBILITY_DICT['Eastern_Bluebird.jpg'] = dict(zip(ORDERED_PARTS, [True]*12))

# --- Image related functions ---
def img_to_base64(img):
    """Encode an image as base64 JPEG text.

    A NumPy array is first wrapped with ``Image.fromarray``; any other input
    is used as-is and must offer a ``save(fp, format=...)`` method.

    Returns:
        The base64 encoding of the JPEG bytes, as ``str``.
    """
    pil_image = img
    if isinstance(img, np.ndarray):
        pil_image = Image.fromarray(img)

    jpeg_buffer = io.BytesIO()
    pil_image.save(jpeg_buffer, format="JPEG")
    return base64.b64encode(jpeg_buffer.getvalue()).decode()

def create_blank_image(width=500, height=500, color=(255, 255, 255)):
    """Build a single-colour RGB image of the given size and return it as a NumPy array."""
    canvas = Image.new("RGB", (width, height), color)
    return np.array(canvas)

# Convert RGB colors to hex
def rgb_to_hex(rgb):
    """Return ``#`` followed by each value of ``rgb`` as lowercase hex, zero-padded to two digits."""
    hex_digits = [format(channel, "02x") for channel in rgb]
    return "#" + "".join(hex_digits)

def load_part_images(file_name: str) -> dict:
    """Load the per-part box images that belong to one gallery image.

    For every part in ``ORDERED_PARTS`` this looks for
    ``<IMAGES_FOLDER>/boxes/<stem>_<part>.jpg`` (``stem`` being ``file_name``
    without directory and extension) and, when that file exists, stores it as
    base64 JPEG text.

    Args:
        file_name: Image file name; only its stem is used.

    Returns:
        Dict mapping part name to base64 JPEG text. Parts that have no image
        file on disk are left out.
    """
    base_name = Path(file_name).stem  # identical for every part: compute once
    boxes_dir = Path(IMAGES_FOLDER) / "boxes"

    part_images = {}
    for part_name in ORDERED_PARTS:
        part_image_path = boxes_dir / f"{base_name}_{part_name}.jpg"
        if not part_image_path.exists():
            continue
        # Decode inside a context manager so the file handle is released
        # immediately instead of lingering until garbage collection.
        with Image.open(part_image_path) as part_image:
            pixels = np.array(part_image)
        part_images[part_name] = img_to_base64(pixels)
    # Original author's measurement: loading the 12 images takes less than
    # 0.01 seconds, so this is not the bottleneck.
    return part_images

def generate_xclip_explanations(result_dict:dict, visibility: dict, part_mask: dict = None):
    """
    Render the per-part explanation of one prediction as an HTML/SVG snippet.

    The result_dict needs three keys: 'descriptions', 'pred_scores', 'file_name'
    descriptions: {part_name1: desc_1, part_name2: desc_2, ...}
    pred_scores: {part_name1: score_1, part_name2: score_2, ...}
    file_name: str, used as a key into PART_IMAGES_DICT

    Args:
        result_dict: See above.
        visibility: {part_name: truthy if the part should be shown}.
        part_mask: {part_name: truthy if the part is enabled}. Defaults to
            all parts in ORDERED_PARTS enabled.

    Returns:
        One HTML string: a fixed-size ``<div>`` wrapping an ``<svg>`` that
        holds, for each part that is both visible and enabled, the wrapped
        description text, a bar whose width is ``score * 400`` and the score
        formatted with two decimals. Negative scores are clamped to 0.
    """
    # Function-scope import: the block stays self-contained and the name does
    # not depend on a file-level `import html`.
    from html import escape

    if part_mask is None:
        # Built per call rather than once at definition time, so no dict is
        # shared between calls.
        part_mask = dict.fromkeys(ORDERED_PARTS, 1)

    descriptions = result_dict['descriptions']
    image_name = result_dict['file_name']
    part_images = PART_IMAGES_DICT[image_name]
    MAX_LENGTH = 50  # characters per rendered text line
    exp_length = 400
    fontsize = 15

    # Start the SVG inside a div
    svg_parts = [f'<div style="width: {exp_length}px; height: 450px; background-color: white;">',
                 "<svg width=\"100%\" height=\"100%\">"]

    # Add a row for each visible bird part
    y_offset = 0
    for part in ORDERED_PARTS:
        if not (visibility[part] and part_mask[part]):
            continue

        # Calculate the length of the bar (scaled to fit within the SVG)
        part_score = max(result_dict['pred_scores'][part], 0)
        bar_length = part_score * exp_length

        # Hovering swaps the overlay image to this part's box image and makes
        # it visible; leaving hides it again. A part without a box image on
        # disk simply gets no hover behaviour instead of raising KeyError.
        overlay = part_images.get(part)
        if overlay is None:
            hover_attrs = ""
        else:
            mouseover_action = f"document.getElementById('overlayImage').src = 'data:image/jpeg;base64,{overlay}'; document.getElementById('overlayImage').style.opacity = 1;"
            mouseout_action = "document.getElementById('overlayImage').style.opacity = 0;"
            hover_attrs = (f'onmouseover="javascript: {mouseover_action};" '
                           f'onmouseout="javascript: {mouseout_action};"')

        # Add the description, MAX_LENGTH characters per line. Stepping with
        # range() avoids the empty trailing line that `len // MAX_LENGTH + 1`
        # produced for lengths that are an exact multiple of MAX_LENGTH, while
        # an empty description still occupies one (empty) line.
        description = descriptions[part]
        for start in range(0, max(len(description), 1), MAX_LENGTH):
            # Descriptions can come from the editable textbox, so escape them
            # before embedding in markup. Escaping happens after slicing so
            # the wrap width counts the characters the user typed.
            desc_line = escape(description[start:start + MAX_LENGTH])
            y_offset += fontsize
            svg_parts.append(f"""
                <text x="0" y="{y_offset}" font-size="{fontsize}"
                    {hover_attrs}>
                    {desc_line}
                </text>
                """)

        # Add the bars
        svg_parts.append(f"""
            <rect x="0" y="{y_offset +3}" width="{bar_length}" height="{fontsize*0.7}" fill="{PART_COLORS[part]}"
                {hover_attrs}>
            </rect>
            """)
        # Add the scores
        svg_parts.append(f'<text x="{exp_length - 50}" y="{y_offset+fontsize+3}" font-size="{fontsize}" fill="{PART_COLORS[part]}">{part_score:.2f}</text>')

        y_offset += fontsize + 3

    svg_parts.extend(("</svg>", "</div>"))
    # Join everything into a single string
    return "".join(svg_parts)



def generate_sachit_explanations(result_dict:dict, bar_color: str = None):
    """Render scored descriptions as an HTML/SVG snippet, highest score first.

    Args:
        result_dict: Needs two keys, 'descriptions' and 'scores', holding
            parallel sequences (description text, score).
        bar_color: Fill colour for the bars and score labels. Defaults to the
            module-level SACHIT_COLOR.

    Returns:
        One HTML string: a fixed-size ``<div>`` wrapping an ``<svg>`` that
        holds, per description, the wrapped text, a bar whose width is
        ``score * 400`` and the score formatted with two decimals. Negative
        scores are clamped to 0.
    """
    # Function-scope import keeps the block self-contained.
    from html import escape

    if bar_color is None:
        bar_color = SACHIT_COLOR

    MAX_LENGTH = 50  # characters per rendered text line
    exp_length = 400
    fontsize = 15

    # Pair every description with its score and order by score, descending.
    scored_descriptions = sorted(
        zip(result_dict['scores'], result_dict['descriptions']),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # Start the SVG inside a div
    svg_parts = [f'<div style="width: {exp_length}px; height: 450px; background-color: white;">',
                 "<svg width=\"100%\" height=\"100%\">"]

    y_offset = 0
    for score, desc in scored_descriptions:
        # Calculate the length of the bar (scaled to fit within the SVG)
        part_score = max(score, 0)
        bar_length = part_score * exp_length

        # Wrap the description at MAX_LENGTH characters. Stepping with range()
        # avoids an empty trailing line when the length is an exact multiple
        # of MAX_LENGTH; an empty description still occupies one line.
        for start in range(0, max(len(desc), 1), MAX_LENGTH):
            # Escape so that '<' or '&' in a description cannot break the markup.
            desc_line = escape(desc[start:start + MAX_LENGTH])
            y_offset += fontsize
            svg_parts.append(f"""
            <text x="0" y="{y_offset}" font-size="{fontsize}" fill="black">
                {desc_line}
            </text>
            """)

        # Add the bar
        svg_parts.append(f"""
        <rect x="0" y="{y_offset+3}" width="{bar_length}" height="{fontsize*0.7}" fill="{bar_color}">
        </rect>
        """)

        # Add the score. The font size must be interpolated; the previous
        # markup emitted the literal text "fontsize" as the attribute value.
        svg_parts.append(f'<text x="{exp_length - 50}" y="{y_offset+fontsize+3}" font-size="{fontsize}" fill="{bar_color}">{part_score:.2f}</text>')

        y_offset += fontsize + 3

    svg_parts.extend(("</svg>", "</div>"))
    # Join everything into a single string
    return "".join(svg_parts)

# --- Constants that need the helper functions defined above ---
# Base64 JPEG of the default blank image; used as the initial overlay source.
BLANK_OVERLAY = img_to_base64(create_blank_image())
# Part name -> hex colour, pairing ORDERED_PARTS with COLORS by position.
PART_COLORS = dict(
    (part_name, rgb_to_hex(COLORS[position]))
    for position, part_name in enumerate(ORDERED_PARTS)
)
blank_image = np.array(Image.open('data/images/final.png').convert('RGB'))
# Gallery file name -> {part name: base64 JPEG of that part's box image}.
PART_IMAGES_DICT = dict(zip(IMAGE_FILE_LIST, map(load_part_images, IMAGE_FILE_LIST)))

# --- Gradio Functions ---
def update_selected_image(event: gr.SelectData):
    """Show the image picked in the gallery together with its initial prediction.

    Records the selection, ground truth and predicted class on the ``.state``
    attribute of the module-level ``current_image``, ``gt_class`` and
    ``current_predicted_class`` components, then runs ``xclip_pred`` with the
    original (unmodified) descriptions.
    NOTE(review): those attributes live on shared component objects, so
    presumably every session sees the same values -- confirm.

    Args:
        event: Gallery select event; ``event.index`` indexes IMAGE_FILE_LIST.

    Returns:
        Tuple of (ground-truth markdown, image HTML with a hidden overlay,
        prediction markdown, explanation HTML, the ``current_image``
        component, a textbox pre-filled with the predicted class's
        descriptions).
    """
    image_height = 400

    image_name = IMAGE_FILE_LIST[event.index]
    current_image.state = image_name

    image_path = os.path.join(IMAGES_FOLDER, 'org', image_name)
    org_image = Image.open(image_path).convert('RGB')
    encoded_image = img_to_base64(org_image)
    img_base64 = f"""
    <div style="position: relative; height: {image_height}px; display: inline-block;">
        <img id="birdImage" src="data:image/jpeg;base64,{encoded_image}" style="height: {image_height}px; width: auto;">
        <img id="overlayImage" src="data:image/jpeg;base64,{BLANK_OVERLAY}" style="position:absolute; top:0; left:0; width:auto; height: {image_height}px; opacity: 0;">
    </div>
    """

    gt_label = IMAGE2GT[image_name]
    gt_class.state = gt_label

    # --- initial prediction, using the unmodified descriptions ---
    selected_path = os.path.join(IMAGES_FOLDER, 'org', current_image.state)
    prediction = xclip_pred(
        new_desc=None,
        new_part_mask=None,
        new_class=None,
        org_desc=XCLIP_DESC_PATH,
        image=Image.open(selected_path).convert('RGB'),
        model=XCLIP,
        owlvit_processor=OWLVIT_PRECESSOR,
        device=DEVICE,
        image_name=current_image.state,
        cub_embeds=CUB_DESC_EMBEDS,
        cub_idx2name=CUB_IDX2NAME,
        descriptors=XCLIP_DESC,
    )
    xclip_label = prediction['pred_class']
    clip_pred_scores = prediction['pred_score']
    result_dict = {
        'descriptions': dict(zip(ORG_PART_ORDER, prediction["descriptions"])),
        'pred_scores': prediction['pred_desc_scores'],
        'file_name': current_image.state,
    }
    every_part_enabled = {part: 1 for part in ORDERED_PARTS}
    xclip_exp = generate_xclip_explanations(
        result_dict,
        VISIBILITY_DICT[current_image.state],
        part_mask=every_part_enabled,
    )
    # --- end of initial prediction ---

    # Green when the prediction matches the ground truth, red otherwise.
    if xclip_label.strip() == gt_label.strip():
        xclip_color = "green"
    else:
        xclip_color = "red"
    xclip_pred_markdown = f"""
        ### <span style='color:{xclip_color}'>XCLIP: {xclip_label} &nbsp;&nbsp;&nbsp; {clip_pred_scores:.4f}</span>
    """

    gt_markdown = f"""
        ## {gt_label}
    """
    current_predicted_class.state = xclip_label

    # Pre-fill the textbox: a class-name header line followed by the predicted
    # class's descriptions, re-ordered from ORG_PART_ORDER to ORDERED_PARTS.
    class_descs = XCLIP_DESC[xclip_label]
    desc_by_part = {part: class_descs[pos] for pos, part in enumerate(ORG_PART_ORDER)}
    text_lines = ["class name: custom"]
    text_lines += [desc_by_part[part] for part in ORDERED_PARTS]
    textbox = gr.Textbox(
        value=";\n".join(text_lines),
        lines=12,
        visible=True,
        label="XCLIP descriptions",
        interactive=True,
        info='Please use ";" to separate the descriptions for each part, and keep the format of {part name}: {descriptions}',
        show_label=False,
    )
    return gt_markdown, img_base64, xclip_pred_markdown, xclip_exp, current_image, textbox

def on_edit_button_click_xclip():
    """Reset the description textbox to the currently predicted class's text.

    Reads the class from ``current_predicted_class.state`` and rebuilds the
    textbox content: a ``class name: custom`` header followed by that class's
    descriptions in ORDERED_PARTS order, joined with ``";\\n"``.

    Returns:
        Tuple of (visible, editable textbox holding the text; hidden HTML
        component used to clear the custom explanation).
    """
    hidden_explanation = gr.HTML(visible=False)

    # XCLIP_DESC lists are indexed in ORG_PART_ORDER; key them by part name
    # first, then read them back in display order.
    class_descs = XCLIP_DESC[current_predicted_class.state]
    desc_by_part = {part: class_descs[pos] for pos, part in enumerate(ORG_PART_ORDER)}
    text_lines = ["class name: custom"]
    text_lines += [desc_by_part[part] for part in ORDERED_PARTS]

    textbox = gr.Textbox(
        value=";\n".join(text_lines),
        lines=12,
        visible=True,
        label="XCLIP descriptions",
        interactive=True,
        info='Please use ";" to separate the descriptions for each part, and keep the format of {part name}: {descriptions}',
        show_label=False,
    )
    return textbox, hidden_explanation

def convert_input_text_to_xclip_format(textbox_input: str, parts=None):
    """Parse the description textbox into per-part descriptions and a mask.

    Expected text: entries separated by ``";\\n"``. The first entry is the
    header ``class name: <name>``; each following entry is
    ``<part name>: <description>``.

    Args:
        textbox_input: Raw textbox content.
        parts: Part names that must appear in the result. Defaults to the
            module-level ORDERED_PARTS.

    Returns:
        Tuple ``(descriptions_dict, part_mask, new_class_name)``:
        - descriptions_dict: part name -> the full entry text (including the
          ``<part name>:`` prefix), or ``""`` for a part from ``parts`` that
          was not supplied;
        - part_mask: part name -> 1 if supplied, 0 otherwise, for every part
          in ``parts``;
        - new_class_name: text after the first colon of the header, stripped
          (the whole stripped header when it contains no colon).
    """
    if parts is None:
        parts = ORDERED_PARTS

    # The first entry should be "class name: xxx".
    header, *entries = textbox_input.split(";\n")
    # Split on the first colon only, so a class name may itself contain ':'
    # and a header without a colon no longer raises IndexError.
    _, has_colon, name_text = header.partition(":")
    new_class_name = name_text.strip() if has_colon else header.strip()

    # Construct the description dict with the part name as key.
    descriptions_dict = {}
    for desc in entries:
        if not desc.strip():
            continue
        # partition() tolerates extra colons inside the description; a plain
        # split(":") unpacked into two names raised ValueError there.
        part_name, has_colon, _ = desc.partition(":")
        if not has_colon:
            # No "<part name>:" prefix, so the entry cannot be attributed to
            # a part: ignore it instead of crashing.
            continue
        descriptions_dict[part_name.strip()] = desc

    # Fill with an empty string (and mask out) every part that is missing.
    part_mask = {}
    for part in parts:
        if part in descriptions_dict:
            part_mask[part] = 1
        else:
            descriptions_dict[part] = ""
            part_mask[part] = 0
    return descriptions_dict, part_mask, new_class_name

def on_predict_button_click_xclip(textbox_input: str):
    """Run the prediction with the user's edited descriptions.

    Parses the textbox, calls ``xclip_pred`` on the image named by
    ``current_image.state`` and renders two explanations: one for the model's
    own prediction and one for the user-defined class. Both use the part mask
    derived from the textbox, so a part the user removed is hidden in both.

    Args:
        textbox_input: Raw content of the description textbox.

    Returns:
        Tuple of (hidden textbox, prediction markdown, prediction explanation
        HTML, custom-class markdown, visible HTML component holding the
        custom-class explanation).
    """
    descriptions_dict, part_mask, new_class_name = convert_input_text_to_xclip_format(textbox_input)

    # Get the new predictions and explanations
    image_path = os.path.join(IMAGES_FOLDER, 'org', current_image.state)
    prediction = xclip_pred(
        new_desc=descriptions_dict,
        new_part_mask=part_mask,
        new_class=new_class_name,
        org_desc=XCLIP_DESC_PATH,
        image=Image.open(image_path).convert('RGB'),
        model=XCLIP,
        owlvit_processor=OWLVIT_PRECESSOR,
        device=DEVICE,
        image_name=current_image.state,
        cub_embeds=CUB_DESC_EMBEDS,
        cub_idx2name=CUB_IDX2NAME,
        descriptors=XCLIP_DESC,
    )
    xclip_label = prediction['pred_class']
    xclip_pred_score = prediction['pred_score']
    xclip_part_scores = prediction['pred_desc_scores']
    custom_label = prediction['modified_class']
    custom_pred_score = prediction['modified_score']
    custom_part_scores = prediction['modified_desc_scores']

    # Build one result dict per explanation: original and user-modified.
    original_result = {
        'descriptions': dict(zip(ORG_PART_ORDER, prediction["descriptions"])),
        'pred_scores': xclip_part_scores,
        'file_name': current_image.state,
    }
    xclip_explanation = generate_xclip_explanations(
        original_result, VISIBILITY_DICT[current_image.state], part_mask
    )
    modified_result = {
        'descriptions': dict(zip(ORG_PART_ORDER, prediction["modified_descriptions"])),
        'pred_scores': custom_part_scores,
        'file_name': current_image.state,
    }
    modified_explanation = generate_xclip_explanations(
        modified_result, VISIBILITY_DICT[current_image.state], part_mask
    )

    # Green when a label matches the stored ground truth, red otherwise.
    if xclip_label.strip() == gt_class.state.strip():
        xclip_color = "green"
    else:
        xclip_color = "red"
    xclip_pred_markdown = f"""
        ### <span style='color:{xclip_color}'> {xclip_label} &nbsp;&nbsp;&nbsp; {xclip_pred_score:.4f}</span>
    """
    if custom_label.strip() == gt_class.state.strip():
        custom_color = "green"
    else:
        custom_color = "red"
    custom_pred_markdown = f"""
        ### <span style='color:{custom_color}'> {custom_label} &nbsp;&nbsp;&nbsp; {custom_pred_score:.4f}</span>
    """

    hidden_textbox = gr.Textbox(visible=False)
    modified_exp = gr.HTML(value=modified_explanation, visible=True)
    return hidden_textbox, xclip_pred_markdown, xclip_explanation, custom_pred_markdown, modified_exp


# Extra CSS handed to gr.Blocks(css=...) below: removes the page margin and
# padding, and styles elements with the ids "container" and "canvas".
# NOTE(review): no element with either id is created in this file -- confirm
# whether these two rules are still needed.
custom_css = """
        html, body {
            margin: 0;
            padding: 0;
        }

        #container {
            position: relative;
            width: 400px;
            height: 400px;
            border: 1px solid #000;
            margin: 0 auto; /* This will center the container horizontally */
        }

        #canvas {
            position: absolute;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            object-fit: cover;
        }

"""

# Define the Gradio interface
# Layout, top to bottom: title and links, the image gallery with usage notes,
# then a three-column row (selected image | model prediction | custom
# descriptions). Event handlers are wired at the end of the `with` block.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="PEEB") as demo:
    # State components. The callbacks above read and write their `.state`
    # attribute directly rather than receiving them as inputs.
    # NOTE(review): presumably this makes the values shared by all sessions -- confirm.
    current_image = gr.State("")
    current_predicted_class = gr.State("")
    gt_class = gr.State("")
    
    with gr.Column():
        title_text = gr.Markdown("# PEEB - demo")
        gr.Markdown(
            """
            - In this demo a demo for PEEB paper (NAACL finding 2024). 
            - paper: https://arxiv.org/abs/2403.05297
            - code: https://github.com/anguyen8/peeb/tree/inspect_ddp
            """
        )

    # display the gallery of images
    with gr.Column():
        
        gr.Markdown("## Select an image to start!")
        image_gallery = gr.Gallery(value=IMAGE_GALLERY, label=None, preview=False, allow_preview=False, columns=10, height=250)
        gr.Markdown("### Custom descritions: \n The first row should be **class name: {some name};**, where you can name your descriptions. \n For the remianing descriptions, please use **;** to separate the descriptions for each part, and use the format **{part name}: {descriptions}**. \n Note that you can delete a part completely, in such cases, all descriptions will remove the corresponding part.")
        
        with gr.Row():
            # Left column: ground-truth class name and the selected image.
            with gr.Column():
                image_label = gr.Markdown("### Class Name")
                org_image = gr.HTML()
            
            # Middle column: predict button, predicted label and explanation.
            with gr.Column():
                with gr.Row():
                    # xclip_predict_button = gr.Button(label="Predict", value="Predict")
                    xclip_predict_button = gr.Button(value="Predict")
                xclip_pred_label = gr.Markdown("### PEEB:")
                xclip_explanation = gr.HTML()

            # Right column: reset button, editable descriptions and the
            # explanation for the custom class.
            with gr.Column():
                # xclip_edit_button = gr.Button(label="Edit", value="Reset Descriptions")
                xclip_edit_button = gr.Button(value="Reset Descriptions")
                custom_pred_label = gr.Markdown(
                    "### Custom Descritpions:"
                )
                xclip_textbox = gr.Textbox(lines=12, placeholder="Edit the descriptions here", visible=False)
                # ai_explanation = gr.Image(type="numpy", visible=True, show_label=False, height=500)
                custom_explanation = gr.HTML()

    gr.HTML("<br>")

    # Event wiring: each handler's return values map, in order, onto `outputs`.
    image_gallery.select(update_selected_image, inputs=None, outputs=[image_label, org_image, xclip_pred_label, xclip_explanation, current_image, xclip_textbox])
    xclip_edit_button.click(on_edit_button_click_xclip, inputs=[], outputs=[xclip_textbox, custom_explanation])
    xclip_predict_button.click(on_predict_button_click_xclip, inputs=[xclip_textbox], outputs=[xclip_textbox, xclip_pred_label, xclip_explanation, custom_pred_label, custom_explanation])

demo.launch()