Samuel Stevens committed
Commit dc20bdb · 0 Parent(s)

initial commit

Files changed (6)
  1. .python-version +1 -0
  2. README.md +0 -0
  3. app.py +512 -0
  4. data.py +0 -0
  5. justfile +9 -0
  6. pyproject.toml +20 -0
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
README.md ADDED
File without changes
app.py ADDED
@@ -0,0 +1,512 @@
+ import functools
+ import logging
+ import os.path
+ import typing
+
+ import beartype
+ import einops
+ import einops.layers.torch
+ import gradio as gr
+ import torch
+ from jaxtyping import Float, Int, UInt8, jaxtyped
+ from PIL import Image
+ from torch import Tensor
+
+ import saev.activations
+ import saev.config
+ import saev.imaging
+ import saev.nn
+ import saev.visuals
+
+ from .. import training
+ from . import data
+
+ ####################
+ # Global Constants #
+ ####################
+
+
+ DEBUG = False
+ """Whether we are debugging."""
+
+ logger = logging.getLogger(__name__)
+ """Module-level logger."""
+
+ max_frequency = 1e-2
+ """Maximum firing frequency. Any latent that fires more often than this is ignored."""
+
+ ckpt = "oebd6e6i"
+ """Which SAE checkpoint to use."""
+
+ n_sae_latents = 3
+ """Number of SAE latents to show."""
+
+ n_sae_examples = 4
+ """Number of SAE examples per latent to show."""
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ """Hardware accelerator, if any."""
+
+ RESIZE_SIZE = 512
+ """Resize the shorter side to this size in pixels."""
+
+ CROP_SIZE = (448, 448)
+ """Crop size in pixels."""
+
+ DEVICE = device
+ """Hardware accelerator, if any (alias of `device`)."""
+
+ ####################
+ # Helper Functions #
+ ####################
+
+
+ @beartype.beartype
+ def load_tensor(path: str) -> Tensor:
+     return torch.load(path, weights_only=True, map_location="cpu")
+
+
+ ##########
+ # Models #
+ ##########
+
+
+ @functools.cache
+ def load_vit(
+     model_cfg: modeling.Config,
+ ) -> tuple[
+     saev.activations.WrappedVisionTransformer,
+     typing.Callable,
+     float,
+     Float[Tensor, " d_vit"],
+ ]:
+     vit = (
+         saev.activations.WrappedVisionTransformer(model_cfg.wrapped_cfg)
+         .to(DEVICE)
+         .eval()
+     )
+     vit_transform = saev.activations.make_img_transform(
+         model_cfg.vit_family, model_cfg.vit_ckpt
+     )
+     logger.info("Loaded ViT: %s.", model_cfg.key)
+
+     try:
+         # Normalizing constants
+         acts_dataset = saev.activations.Dataset(model_cfg.acts_cfg)
+         logger.info("Loaded dataset norms: %s.", model_cfg.key)
+     except RuntimeError as err:
+         logger.warning("Error loading dataset norms: %s", err)
+         return None, None, None, None
+
+     return vit, vit_transform, acts_dataset.scalar.item(), acts_dataset.act_mean
+
+
+ sae_ckpt_fpath = f"/home/stevens.994/projects/saev/checkpoints/{ckpt}/sae.pt"
+ sae = saev.nn.load(sae_ckpt_fpath)
+ sae.to(device).eval()
+
+
+ head_ckpt_fpath = "/home/stevens.994/projects/saev/checkpoints/contrib/semseg/lr_0_001__wd_0_001/model_step8000.pt"
+ head = training.load(head_ckpt_fpath)
+ head = head.to(device).eval()
+
+
+ class RestOfDinoV2(torch.nn.Module):
+     def __init__(self, *, n_end_layers: int):
+         super().__init__()
+         self.vit = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14_reg")
+         self.n_end_layers = n_end_layers
+
+     def forward_start(self, x: Float[Tensor, "batch channels width height"]):
+         x_BPD = self.vit.prepare_tokens_with_masks(x)
+         for blk in self.vit.blocks[: -self.n_end_layers]:
+             x_BPD = blk(x_BPD)
+
+         return x_BPD
+
+     def forward_end(self, x_BPD: Float[Tensor, "batch n_patches dim"]):
+         for blk in self.vit.blocks[-self.n_end_layers :]:
+             x_BPD = blk(x_BPD)
+
+         x_BPD = self.vit.norm(x_BPD)
+         return x_BPD[:, self.vit.num_register_tokens + 1 :]
+
+
+ rest_of_vit = RestOfDinoV2(n_end_layers=1)
+ rest_of_vit = rest_of_vit.to(device)
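+
+ # A minimal sketch of how the two halves compose (hypothetical input; with a
+ # 224x224 image, DINOv2's 14-pixel patches give a 16x16 grid of patch tokens):
+ #
+ #     x = torch.rand(1, 3, 224, 224, device=device)
+ #     x_BPD = rest_of_vit.forward_start(x)    # every block except the last
+ #     x_BPD = rest_of_vit.forward_end(x_BPD)  # last block + norm, patch tokens only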
+
+
+ ####################
+ # Global Variables #
+ ####################
+
+
+ ckpt_data_root = (
+     f"/research/nfs_su_809/workspace/stevens.994/saev/features/{ckpt}/sort_by_patch"
+ )
+
+ top_img_i = load_tensor(os.path.join(ckpt_data_root, "top_img_i.pt"))
+ top_values = load_tensor(os.path.join(ckpt_data_root, "top_values.pt"))
+ sparsity = load_tensor(os.path.join(ckpt_data_root, "sparsity.pt"))
+
+
+ mask = torch.ones((sae.cfg.d_sae), dtype=bool)
+ mask = mask & (sparsity < max_frequency)
+
+
+ ############
+ # Datasets #
+ ############
+
+
+ # in1k_dataset = saev.activations.get_dataset(
+ #     saev.config.ImagenetDataset(),
+ #     img_transform=v2.Compose([
+ #         v2.Resize(size=(512, 512)),
+ #         v2.CenterCrop(size=(448, 448)),
+ #     ]),
+ # )
+
+
+ # acts_dataset = saev.activations.Dataset(
+ #     saev.config.DataLoad(
+ #         shard_root="/local/scratch/stevens.994/cache/saev/a1f842330bb568b2fb05c15d4fa4252fb7f5204837335000d9fd420f120cd03e",
+ #         scale_mean=not DEBUG,
+ #         scale_norm=not DEBUG,
+ #         layer=-2,
+ #     )
+ # )
+
+
+ # vit_dataset = saev.activations.Ade20k(
+ #     saev.config.Ade20kDataset(
+ #         root="/research/nfs_su_809/workspace/stevens.994/datasets/ade20k/"
+ #     ),
+ #     img_transform=v2.Compose([
+ #         v2.Resize(size=(256, 256)),
+ #         v2.CenterCrop(size=(224, 224)),
+ #         v2.ToImage(),
+ #         v2.ToDtype(torch.float32, scale=True),
+ #         v2.Normalize(mean=[0.4850, 0.4560, 0.4060], std=[0.2290, 0.2240, 0.2250]),
+ #     ]),
+ # )
+
+
+ #######################
+ # Inference Functions #
+ #######################
+
+
+ @beartype.beartype
+ class Example(typing.TypedDict):
+     """Represents an example image and its associated label.
+
+     Used to store examples of SAE latent activations for visualization.
+     """
+
+     orig_url: str
+     """The URL or path to access the original example image."""
+     highlighted_url: str
+     """The URL or path to access the SAE-highlighted image."""
+     index: int
+     """Dataset index."""
+
+
+ @beartype.beartype
+ class SaeActivation(typing.TypedDict):
+     """Represents the activation pattern of a single SAE latent across patches.
+
+     This captures how strongly a particular SAE latent fires on different patches of an input image.
+     """
+
+     latent: int
+     """The index of the SAE latent being measured."""
+
+     highlighted_url: str
+     """The image with the colormaps applied."""
+
+     activations: list[float]
+     """The activation values of this latent across different patches. Each value represents how strongly this latent fired on a particular patch."""
+
+     examples: list[Example]
+     """Top examples for this latent."""
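+
+     # A hypothetical payload, just to illustrate the shape this type describes
+     # (all values are made up):
+     #
+     #     SaeActivation(
+     #         latent=1024,
+     #         highlighted_url="data:image/png;base64,...",
+     #         activations=[0.0, 0.3, 2.1],  # one value per patch
+     #         examples=[Example(orig_url="...", highlighted_url="...", index=17)],
+     #     )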
+
+
+ @beartype.beartype
+ def get_image(image_i: int) -> tuple[str, str, int]:
+     img_sized, labels_sized = data.get_sample(image_i)
+
+     return data.pil_to_base64(img_sized), data.pil_to_base64(labels_sized), image_i
+
+
+ @beartype.beartype
+ @torch.inference_mode
+ def get_sae_activations(image_i: int, patches: list[int]) -> list[SaeActivation]:
+     """
+     Given an image and a set of its patches, return the SAE latents that fire most strongly on those patches, along with highlighted example images for each latent.
+     """
+     if not patches:
+         return []
+
+     vit, vit_transform, scalar, mean = load_vit(model_cfg)
+     if vit is None:
+         logger.warning("Skipping ViT '%s'", model_name)
+         return []
+     sae = load_sae(model_cfg)
+
+     mean = mean.to(DEVICE)
+     x = vit_transform(img_p)[None, ...].to(DEVICE)
+
+     _, vit_acts_BLPD = vit(x)
+     vit_acts_PD = (vit_acts_BLPD[0, 0, 1:].to(DEVICE).clamp(-1e-5, 1e5) - mean) / scalar
+
+     _, f_x_PS, _ = sae(vit_acts_PD)
+     # The [CLS] token was already dropped above; transpose to (latents, patches).
+     acts_SP = einops.rearrange(f_x_PS, "patches n_latents -> n_latents patches")
+     logger.info("Got SAE activations for '%s'.", model_name)
+     top_img_i, top_values = load_tensors(model_cfg)
+     logger.info("Loaded top SAE activations for '%s'.", model_name)
+
+     vit_acts_MD = torch.stack([
+         acts_dataset[image_i * acts_dataset.metadata.n_patches_per_img + i]["act"]
+         for i in patches
+     ]).to(device)
+
+     _, f_x_MS, _ = sae(vit_acts_MD)
+     f_x_S = f_x_MS.sum(axis=0)
+
+     latents = torch.argsort(f_x_S, descending=True).cpu()
+     latents = latents[mask[latents]][:n_sae_latents].tolist()
+
+     images = []
+     for latent in latents:
+         elems, seen_i_im = [], set()
+         for i_im, values_p in zip(top_img_i[latent].tolist(), top_values[latent]):
+             if i_im in seen_i_im:
+                 continue
+
+             example = in1k_dataset[i_im]
+             elems.append(
+                 saev.visuals.GridElement(example["image"], example["label"], values_p)
+             )
+             seen_i_im.add(i_im)
+
+         # How to scale values.
+         upper = None
+         if top_values[latent].numel() > 0:
+             upper = top_values[latent].max().item()
+
+         latent_images = [make_img(elem, upper=upper) for elem in elems[:n_sae_examples]]
+
+         while len(latent_images) < n_sae_examples:
+             latent_images += [None]
+
+         images.extend(latent_images)
+
+     return images + latents
+
+
+ @torch.inference_mode
+ def get_true_labels(image_i: int) -> Image.Image:
+     seg = human_dataset[image_i]["segmentation"]
+     image = seg_to_img(seg)
+     return image
+
+
+ @torch.inference_mode
+ def get_pred_labels(i: int) -> list[Image.Image | list[int]]:
+     sample = vit_dataset[i]
+     x = sample["image"][None, ...].to(device)
+     x_BPD = rest_of_vit.forward_start(x)
+     x_BPD = rest_of_vit.forward_end(x_BPD)
+
+     x_WHD = einops.rearrange(x_BPD, "() (w h) dim -> w h dim", w=16, h=16)
+
+     logits_WHC = head(x_WHD)
+
+     pred_WH = logits_WHC.argmax(axis=-1)
+     preds = einops.rearrange(pred_WH, "w h -> (w h)").tolist()
+     return [seg_to_img(upsample(pred_WH)), preds]
+
+
+ @beartype.beartype
+ def unscaled(x: float, max_obs: float) -> float:
+     """Scale from [-10, 10] to [-10 * max_obs, 10 * max_obs]."""
+     return map_range(x, (-10.0, 10.0), (-10.0 * max_obs, 10.0 * max_obs))
+
+
+ @beartype.beartype
+ def map_range(
+     x: float,
+     domain: tuple[float | int, float | int],
+     range: tuple[float | int, float | int],
+ ):
+     a, b = domain
+     c, d = range
+     if not (a <= x <= b):
+         raise ValueError(f"x={x:.3f} must be in {[a, b]}.")
+     return c + (x - a) * (d - c) / (b - a)
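+
+
+ # To make the scaling concrete: unscaled(x, max_obs) reduces to x * max_obs, so a
+ # slider value of 5.0 with max_obs=2.0 sets a latent to 10.0, and
+ # map_range(5.0, (-10.0, 10.0), (0.0, 1.0)) returns 0.75.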
+
+
+ @torch.inference_mode
+ def get_modified_labels(
+     i: int,
+     latent1: int,
+     latent2: int,
+     latent3: int,
+     value1: float,
+     value2: float,
+     value3: float,
+ ) -> list[Image.Image | list[int]]:
+     sample = vit_dataset[i]
+     x = sample["image"][None, ...].to(device)
+     x_BPD = rest_of_vit.forward_start(x)
+
+     x_hat_BPD, f_x_BPS, _ = sae(x_BPD)
+
+     err_BPD = x_BPD - x_hat_BPD
+
+     values = torch.tensor(
+         [
+             unscaled(float(value), top_values[latent].max().item())
+             for value, latent in [
+                 (value1, latent1),
+                 (value2, latent2),
+                 (value3, latent3),
+             ]
+         ],
+         device=device,
+     )
+     f_x_BPS[..., torch.tensor([latent1, latent2, latent3], device=device)] = values
+
+     # Reproduce the SAE forward pass after f_x.
+     modified_x_hat_BPD = (
+         einops.einsum(
+             f_x_BPS,
+             sae.W_dec,
+             "batch patches d_sae, d_sae d_vit -> batch patches d_vit",
+         )
+         + sae.b_dec
+     )
+     modified_BPD = err_BPD + modified_x_hat_BPD
+
+     modified_BPD = rest_of_vit.forward_end(modified_BPD)
+
+     logits_BPC = head(modified_BPD)
+     pred_P = logits_BPC[0].argmax(axis=-1)
+     pred_WH = einops.rearrange(pred_P, "(w h) -> w h", w=16, h=16)
+     return seg_to_img(upsample(pred_WH)), pred_P.tolist()
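+
+
+ # Why err_BPD is added back in get_modified_labels: since err = x - (W_dec f(x) + b_dec),
+ # adding it to the modified reconstruction W_dec f'(x) + b_dec preserves everything the
+ # SAE fails to capture, so only the three edited latents change the activations that
+ # flow into the rest of the ViT.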
+
+
+ @jaxtyped(typechecker=beartype.beartype)
+ @torch.inference_mode
+ def upsample(
+     x_WH: Int[Tensor, "width_ps height_ps"],
+ ) -> UInt8[Tensor, "width_px height_px"]:
+     return (
+         torch.nn.functional.interpolate(
+             x_WH.view((1, 1, 16, 16)).float(),
+             scale_factor=28,
+         )
+         .view((448, 448))
+         .type(torch.uint8)
+     )
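+
+
+ # upsample repeats each of the 16x16 patch predictions over a 28x28 pixel block
+ # (16 * 28 = 448), i.e. nearest-neighbor upsampling of the patch-level class map
+ # to pixel resolution.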
+
+
+ @beartype.beartype
+ def make_img(
+     elem: saev.visuals.GridElement, *, upper: float | None = None
+ ) -> Image.Image:
+     # Resize to 512x512 and center-crop to 448x448.
+     resize_size_px = (512, 512)
+     resize_w_px, resize_h_px = resize_size_px
+     crop_size_px = (448, 448)
+     crop_w_px, crop_h_px = crop_size_px
+     crop_coords_px = (
+         (resize_w_px - crop_w_px) // 2,
+         (resize_h_px - crop_h_px) // 2,
+         (resize_w_px + crop_w_px) // 2,
+         (resize_h_px + crop_h_px) // 2,
+     )
+
+     img = elem.img.resize(resize_size_px).crop(crop_coords_px)
+     img = saev.imaging.add_highlights(
+         img, elem.patches.numpy(), upper=upper, opacity=0.5
+     )
+     return img
+
+
+ with gr.Blocks() as demo:
+     image_number = gr.Number(label="Validation Example")
+
+     input_image_base64 = gr.Text(label="Image in Base64")
+     true_labels_base64 = gr.Text(label="Labels in Base64")
+
+     get_input_image_btn = gr.Button(value="Get Input Image")
+     get_input_image_btn.click(
+         get_image,
+         inputs=[image_number],
+         outputs=[input_image_base64, true_labels_base64, image_number],
+         api_name="get-image",
+     )
+
+     # input_image = gr.Image(
+     #     label="Input Image",
+     #     sources=["upload", "clipboard"],
+     #     type="pil",
+     #     interactive=True,
+     # )
+     # patch_numbers = gr.CheckboxGroup(label="Image Patch", choices=list(range(256)))
+     # top_latent_numbers = gr.CheckboxGroup(label="Top Latents")
+     # top_latent_numbers = [
+     #     gr.Number(label=f"Top Latents #{j + 1}") for j in range(n_sae_latents)
+     # ]
+     # sae_example_images = [
+     #     gr.Image(label=f"Latent #{j}, Example #{i + 1}", format="png")
+     #     for i in range(n_sae_examples)
+     #     for j in range(n_sae_latents)
+     # ]
+
+     patches_json = gr.JSON(label="Patches", value=[])
+     activations_json = gr.JSON(label="Activations", value=[])
+
+     get_sae_activations_btn = gr.Button(value="Get SAE Activations")
+     get_sae_activations_btn.click(
+         get_sae_activations,
+         inputs=[image_number, patches_json],
+         outputs=[activations_json],
+         api_name="get-sae-examples",
+     )
+     # semseg_image = gr.Image(label="Semantic Segmentations", format="png")
+     # semseg_colors = gr.CheckboxGroup(
+     #     label="Sem Seg Colors", choices=list(range(1, 151))
+     # )
+
+     # get_pred_labels_btn = gr.Button(value="Get Pred. Labels")
+     # get_pred_labels_btn.click(
+     #     get_pred_labels,
+     #     inputs=[image_number],
+     #     outputs=[semseg_image, semseg_colors],
+     #     api_name="get-pred-labels",
+     # )
+
+     # get_true_labels_btn = gr.Button(value="Get True Label")
+     # get_true_labels_btn.click(
+     #     get_true_labels,
+     #     inputs=[image_number],
+     #     outputs=semseg_image,
+     #     api_name="get-true-labels",
+     # )
+
+     # latent_numbers = [gr.Number(label=f"Latent {i + 1}") for i in range(3)]
+     # value_sliders = [
+     #     gr.Slider(label=f"Value {i + 1}", minimum=-10, maximum=10) for i in range(3)
+     # ]
+
+     # get_modified_labels_btn = gr.Button(value="Get Modified Label")
+     # get_modified_labels_btn.click(
+     #     get_modified_labels,
+     #     inputs=[image_number] + latent_numbers + value_sliders,
+     #     outputs=[semseg_image, semseg_colors],
+     #     api_name="get-modified-labels",
+     # )
+
+ if __name__ == "__main__":
+     demo.launch()
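+
+
+ # A sketch of how a client could call the endpoints exposed above through their
+ # api_name values (the Space id here is hypothetical):
+ #
+ #     from gradio_client import Client
+ #
+ #     client = Client("samuelstevens/saev-semantic-segmentation")
+ #     img_b64, labels_b64, i = client.predict(0, api_name="/get-image")
+ #     acts = client.predict(0, [0, 1, 2], api_name="/get-sae-examples")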
data.py ADDED
File without changes
justfile ADDED
@@ -0,0 +1,9 @@
+ build: lint
+     uv pip compile pyproject.toml > requirements.txt
+
+ lint: fmt
+     git ls-files "*.py" --cached --others --exclude-standard | xargs uv run ruff check
+
+ fmt:
+     git ls-files "*.py" --cached --others --exclude-standard | xargs uv run isort
+     git ls-files "*.py" --cached --others --exclude-standard | xargs uv run ruff format --preview
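+
+ # `just build` runs fmt, then lint, then regenerates requirements.txt, because
+ # build depends on lint and lint depends on fmt.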
pyproject.toml ADDED
@@ -0,0 +1,20 @@
+ [project]
+ name = "saev-semantic-segmentation"
+ version = "0.1.0"
+ description = "Gradio app space for semantic segmentation with SAEs"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = [
+     "beartype>=0.19.0",
+     "einops>=0.8.0",
+     "gradio>=5.3.0",
+     "numpy>=2.2.2",
+     "saev",
+     "torch>=2.6.0",
+     "torchvision>=0.21.0",
+ ]
+
+ [tool.ruff.lint]
+ ignore = ["F722"]
+
+ [tool.uv.sources]
+ saev = { git = "https://github.com/samuelstevens/saev" }