sayakpaul's picture
sayakpaul HF staff
debugging
83c742a
raw
history blame
4.09 kB
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import timm
import torch
from timm import create_model
from timm.models.layers import PatchEmbed
from torchvision.models.feature_extraction import create_feature_extractor
from torchvision.transforms import functional as F
import glob
CAIT_MODEL = create_model("cait_xxs24_224.fb_dist_in1k", pretrained=True).eval()
TRANSFORM = timm.data.create_transform(
**timm.data.resolve_data_config(CAIT_MODEL.pretrained_cfg)
)
PATCH_SIZE = 16
def create_attn_extractor(block_id=0):
"""Creates a model that produces the softmax attention scores.
References:
https://github.com/huggingface/pytorch-image-models/discussions/926
"""
feature_extractor = create_feature_extractor(
CAIT_MODEL,
return_nodes=[f"blocks_token_only.{block_id}.attn.softmax"],
tracer_kwargs={"leaf_modules": [PatchEmbed]},
)
return feature_extractor
def get_cls_attention_map(
image, attn_score_dict, block_key="blocks_token_only.0.attn.softmax"
):
"""Prepares attention maps so that they can be visualized."""
w_featmap = image.shape[3] // PATCH_SIZE
h_featmap = image.shape[2] // PATCH_SIZE
attention_scores = attn_score_dict[block_key]
nh = attention_scores.shape[1] # Number of attention heads.
# Taking the representations from CLS token.
attentions = attention_scores[0, :, 0, 1:].reshape(nh, -1)
print(attentions.shape)
# Reshape the attention scores to resemble mini patches.
attentions = attentions.reshape(nh, w_featmap, h_featmap)
print(attentions.shape)
# Resize the attention patches to 224x224 (224: 14x16)
attentions = F.resize(
attentions,
size=(h_featmap * PATCH_SIZE, w_featmap * PATCH_SIZE),
interpolation=3,
)
print(attentions.shape)
return attentions
def generate_plot(processed_map):
"""Generates a class attention map plot."""
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(13, 13))
img_count = 0
for i in range(processed_map.shape[0]):
if img_count < processed_map.shape[0]:
axes[i].imshow(processed_map[img_count].numpy())
axes[i].title.set_text(f"Attention head: {img_count}")
axes[i].axis("off")
img_count += 1
fig.tight_layout()
return fig
def serialize_images(processed_map):
"""Serializes attention maps."""
for i in range(processed_map.shape[0]):
plt.imshow(processed_map[i].numpy())
plt.title(f"Attention head: {i}", fontsize=14)
plt.axis("off")
plt.savefig(fname="attention_map_{i}.png")
def generate_class_attn_map(image, block_id=0):
"""Collates the above utilities together for generating
a class attention map."""
image_tensor = TRANSFORM(image).unsqueeze(0)
feature_extractor = create_attn_extractor(block_id)
with torch.no_grad():
out = feature_extractor(image_tensor)
block_key = f"blocks_token_only.{block_id}.attn.softmax"
processed_cls_attn_map = get_cls_attention_map(image_tensor, out, block_key)
serialize_images(processed_cls_attn_map)
all_attn_img_paths = sorted(glob.glob("*.png"))
print(f"Number of images: {len(all_attn_img_paths)}")
return all_attn_img_paths
title = "Class Attention Maps"
article = "Class attention maps as investigated in [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239) (Touvron et al.). We use the [cait_xxs24_224](https://huggingface.co/timm/cait_xxs24_224.fb_dist_in1k) variant of CaiT. One can find all the other variants [here](https://huggingface.co/models?search=cait)."
iface = gr.Interface(
generate_class_attn_map,
inputs=[
gr.inputs.Image(type="pil", label="Input Image"),
gr.Slider(0, 1, value=0, step=1, label="Block ID", info="Transformer Block ID"),
],
outputs=gr.Gallery().style(grid=[2], height="auto"),
title=title,
article=article,
allow_flagging="never",
cache_examples=True,
examples=[["./bird.png", 0]],
)
iface.launch(debug=True)