import io

import gradio as gr
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image, ImageDraw, ImageFont

from CLIP.clip import ClipWrapper, saliency_configs


def plot_to_png(fig):
    """Render a matplotlib figure to an RGBA uint8 numpy array."""
    buf = io.BytesIO()
    fig.savefig(buf, format="png")
    buf.seek(0)
    img = np.array(Image.open(buf)).astype(np.uint8)
    return img


def add_text_to_image(
    image: np.ndarray,
    text,
    position,
    color="rgb(255, 255, 255)",
    fontsize=60,
):
    """Draw `text` onto `image` at `position` and return the result as an array."""
    image = Image.fromarray(image)
    draw = ImageDraw.Draw(image)
    try:
        # Hardcoded font path from the demo environment; fall back to PIL's
        # built-in default font if it is not installed locally.
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/lato/Lato-Medium.ttf", fontsize
        )
    except OSError:
        font = ImageFont.load_default()
    draw.text(position, text, fill=color, font=font)
    return np.array(image)


def generate_relevancy(
    img: np.ndarray,
    labels: str,
    prompt: str,
    saliency_config: str,
    subtract_mean: bool,
):
    """Compute a multi-scale relevancy map for each comma-separated label and
    return one overlay visualization per label."""
    labels = [label.strip() for label in labels.split(",")]
    prompts = [prompt]
    # Upsample to a large fixed square so the multi-scale saliency crops have
    # enough resolution to work with.
    img = np.asarray(Image.fromarray(img).resize((244 * 4, 244 * 4)))
    assert img.dtype == np.uint8
    h, w, c = img.shape
    grads = ClipWrapper.get_clip_saliency(
        img=img,
        text_labels=np.array(labels),
        prompts=prompts,
        **saliency_configs[saliency_config](h),
    )[0]
    if subtract_mean:
        # Subtracting the mean over labels suppresses regions that are salient
        # for every query, leaving label-specific evidence.
        grads -= grads.mean(axis=0)
    grads = grads.cpu().numpy()

    vmin, vmax = 0.002, 0.008
    cmap = plt.get_cmap("jet")
    returns = []
    for label_grad, label in zip(grads, labels):
        fig, ax = plt.subplots(1, 1, figsize=(4, 4))
        ax.axis("off")
        ax.imshow(img)
        # Normalize relevancy values into [0, 1] and color-map them.
        grad = np.clip((label_grad - vmin) / (vmax - vmin), a_min=0.0, a_max=1.0)
        colored_grad = cmap(grad)
        # Make low-relevancy regions more opaque so they darken the image,
        # keeping high-relevancy regions visible underneath.
        grad = 1 - grad
        colored_grad[..., -1] = grad * 0.7
        colored_grad = add_text_to_image(
            (colored_grad * 255).astype(np.uint8), text=label, position=(0, 0)
        )
        colored_grad = colored_grad.astype(float) / 255
        ax.imshow(colored_grad)
        fig.tight_layout(pad=0)
        returns.append(plot_to_png(fig))
        plt.close(fig)
    return returns


iface = gr.Interface(
    title="Semantic Abstraction Multi-scale Relevancy Extractor",
    description="""A CPU-only demo of [Semantic Abstraction](https://semantic-abstraction.cs.columbia.edu/)'s Multi-Scale Relevancy Extractor. To run GPU inference locally, use the [official codebase release](https://github.com/columbia-ai-robotics/semantic-abstraction).

This relevancy extractor builds heavily on [Chefer et al.'s codebase](https://github.com/hila-chefer/Transformer-MM-Explainability) and [CLIP on Wheels' codebase](https://cow.cs.columbia.edu/).""",
    fn=generate_relevancy,
    cache_examples=False,
    inputs=[
        gr.Image(type="numpy", label="Image"),
        gr.Textbox(label="Labels (comma separated)"),
        gr.Textbox(label="Prompt"),
        gr.Dropdown(
            value="ours",
            choices=["ours", "ours_fast", "chefer_et_al"],
            label="Relevancy Configuration",
        ),
        gr.Checkbox(value=True, label="Subtract mean"),
    ],
    outputs=gr.Gallery(label="Relevancy Maps", type="numpy"),
    examples=[
        [
            "https://semantic-abstraction.cs.columbia.edu/downloads/gameroom.png",
            "basketball jersey,nintendo switch,television,ping pong table,vase,fireplace,abstract painting of a vespa,carpet,wall",
            "a photograph of a {} in a home.",
            "ours_fast",
            True,
        ],
        [
            "https://semantic-abstraction.cs.columbia.edu/downloads/livingroom.png",
            "monopoly boardgame set,door knob,sofa,coffee table,plant,carpet,wall",
            "a photograph of a {} in a home.",
            "ours_fast",
            True,
        ],
        [
            "https://semantic-abstraction.cs.columbia.edu/downloads/fireplace.png",
            "fireplace,beige armchair,candle,large indoor plant in a pot,forest painting,cheetah-patterned pillow,floor,carpet,wall",
            "a photograph of a {} in a home.",
            "ours_fast",
            True,
        ],
        [
            "https://semantic-abstraction.cs.columbia.edu/downloads/walle.png",
            "WALL-E,a fire extinguisher",
            "a 3D render of {}.",
            "ours_fast",
            True,
        ],
    ],
)

# iface.launch(share=True)  # uncomment for a publicly shareable link
iface.launch()
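
# ---------------------------------------------------------------------------
# Headless usage sketch (commented out so the script still just launches the
# demo). It shows how generate_relevancy() might be called directly, without
# the Gradio UI; the image path "example.png" and the labels below are
# illustrative placeholders, not part of the original demo.
#
# img = np.array(Image.open("example.png").convert("RGB"))
# maps = generate_relevancy(
#     img=img,
#     labels="sofa,plant,wall",
#     prompt="a photograph of a {} in a home.",
#     saliency_config="ours_fast",
#     subtract_mean=True,
# )
# for i, relevancy_map in enumerate(maps):
#     Image.fromarray(relevancy_map).save(f"relevancy_{i}.png")
# ---------------------------------------------------------------------------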