Update app.py
app.py
CHANGED
@@ -1,47 +1,98 @@
 import os
 
 # Disable JIT
 os.environ["PYTORCH_JIT"] = "0"
 
 from einops import rearrange
 import gradio as gr
 import spaces
-import torch
 import torch.nn.functional as F
 from PIL import Image, ImageOps
 from transformers import AutoModel, CLIPImageProcessor
 
-hf_repo = "nvidia/RADIO-L"
 
-
-
-
 
 
-
-
-# RADIO
 
-
-
-
-Outperforming teachers in ImageNet zero-shot (+6.8%), kNN (+2.39%), and linear probing segmentation (+3.8%) and vision-language models (LLaVa 1.5 up to 1.5%), it scales to any resolution, supports non-square images.
 
-
 
-
-"""
 
-
-    gr.Image(type="pil")
-]
 
-
 
-outputs = [
-    gr.Textbox(label="Feature Shape"),
-    gr.Image(),
-]
 
 def get_robust_pca(features: torch.Tensor, m: float = 2, remove_first_component=False):
     # features: (N, C)
@@ -110,11 +161,11 @@ def get_pca_map(
     return pca_color
 
 
-def pad_image_to_multiple_of_16(image):
-    # Calculate the new dimensions to make them multiples
     width, height = image.size
-    new_width = (width +
-    new_height = (height +
 
     # Calculate the padding needed on each side
     pad_width = new_width - width
@@ -131,17 +182,83 @@ def pad_image_to_multiple_of_16(image):
     return padded_image
 
 
 @spaces.GPU
 def infer_radio(image):
     """Define the function to generate the output."""
     model.cuda()
-
-
-
-    pixel_values = pixel_values.to(torch.bfloat16).cuda()
 
-
 
 
     num_rows = height // model.patch_size
     num_cols = width // model.patch_size
@@ -150,15 +267,49 @@ def infer_radio(image):
     features = rearrange(features, 'b (h w) c -> b h w c', h=num_rows, w=num_cols).float()
 
     pca_viz = get_pca_map(features, (height, width), interpolation='bilinear')
 
-    return f"{features.shape}", pca_viz
 
 
 # Create the Gradio interface
 demo = gr.Interface(
     fn=infer_radio,
     inputs=inputs,
-    examples=
     outputs=outputs,
     title=title,
     description=description,
@@ -167,4 +318,3 @@ demo = gr.Interface(
 
 if __name__ == "__main__":
     demo.launch()
-
Updated app.py (added lines are marked with +; unchanged stretches are collapsed):

 import os
+import requests
 
 # Disable JIT
 os.environ["PYTORCH_JIT"] = "0"
 
 from einops import rearrange
 import gradio as gr
+import numpy as np
 import spaces
+import torch
+import torch.nn as nn
 import torch.nn.functional as F
 from PIL import Image, ImageOps
 from transformers import AutoModel, CLIPImageProcessor
+from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
+from segment_anything.modeling.image_encoder import ImageEncoderViT
 
 
+class RADIOVenc(nn.Module):
+    def __init__(self, radio: nn.Module, img_enc: ImageEncoderViT, img_size: int = 1024):
+        super().__init__()
+        self.radio = radio
+        self.neck = img_enc.neck
+        self.img_size = img_size
+        self.dtype = radio.input_conditioner.dtype
 
+    def forward(self, x: torch.Tensor):
+        h, w = x.shape[-2:]
 
+        if self.dtype is not None:
+            x = x.to(dtype=self.dtype)
 
+        with torch.autocast('cuda', dtype=torch.bfloat16, enabled=self.dtype is None):
+            output = self.radio(x)
+            features = output["sam"].features
 
+        rows = h // 16
+        cols = w // 16
 
+        features = rearrange(features, 'b (h w) c -> b c h w', h=rows, w=cols)
 
+        features = self.neck(features)
 
+        return features
+
+
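RADIOVenc stands in for SAM's ViT-H image encoder: it runs RADIO, reads the features produced by the "sam" adaptor, reshapes the token sequence into a 2-D grid (the hard-coded 16 matches the ViT-L/16 patch size, also set via model._patch_size = 16 below), and passes the grid through the original encoder's neck so the output has the shape SAM's mask decoder expects.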
+def download_file(url, save_path):
+    # Check if the file already exists
+    if os.path.exists(save_path):
+        print(f"File already exists at {save_path}. Skipping download.")
+        return
+
+    print(f"Downloading from {url}")
+
+    # Send a GET request to the URL
+    response = requests.get(url, stream=True)
+
+    # Check if the request was successful
+    if response.status_code == 200:
+        # Open the file in binary write mode
+        with open(save_path, 'wb') as file:
+            # Iterate over the response content in chunks
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    file.write(chunk)
+        print(f"File downloaded successfully and saved as {save_path}")
+    else:
+        print(f"Failed to download file. HTTP Status Code: {response.status_code}")
+
+
+hf_repo = "nvidia/RADIO-L"
+image_processor = CLIPImageProcessor.from_pretrained(hf_repo)
+
+model_version = "radio_v2.5-l"  # for RADIOv2.5-L model (ViT-L/16)
+
+model = torch.hub.load(
+    'NVlabs/RADIO',
+    'radio_model',
+    version=model_version,
+    progress=True,
+    skip_validation=True,
+    adaptor_names='sam',
+    vitdet_window_size=16)
+model.eval()
+
+local_sam_checkpoint_path = "sam_vit_h_4b8939.pth"
+download_file("https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth", local_sam_checkpoint_path)
+sam = sam_model_registry["vit_h"](checkpoint=local_sam_checkpoint_path)
+model._patch_size = 16
+sam.image_encoder = RADIOVenc(model, sam.image_encoder, img_size=1024)
+conditioner = model.make_preprocessor_external()
+sam.pixel_mean = conditioner.norm_mean * 255
+sam.pixel_std = conditioner.norm_std * 255
 
 
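For orientation: loaded with adaptor_names='sam', the hub model returns a dict per forward pass; infer_radio below unpacks the "backbone" entry, while RADIOVenc reads the "sam" adaptor. A minimal sketch of that access pattern (illustrative only; assumes a CUDA device and the objects defined above, with input sides that are multiples of 256 to match the padding infer_radio applies below):

model.cuda()
conditioner.cuda()
x = torch.zeros(1, 3, 512, 512, dtype=torch.bfloat16, device="cuda")  # dummy image batch
out = model(conditioner(x))
summary, spatial = out["backbone"]   # spatial patch tokens: (1, (512 // 16) ** 2, C)
sam_feats = out["sam"].features      # adaptor features consumed by RADIOVenc above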
 def get_robust_pca(features: torch.Tensor, m: float = 2, remove_first_component=False):
     # features: (N, C)

(lines 99-160, the unchanged bodies of get_robust_pca and get_pca_map, collapsed)
     return pca_color
 
 
+def pad_image_to_multiple_of(image, multiple=16):
+    # Calculate the new dimensions to make them multiples
     width, height = image.size
+    new_width = (width + multiple - 1) // multiple * multiple
+    new_height = (height + multiple - 1) // multiple * multiple
 
     # Calculate the padding needed on each side
     pad_width = new_width - width

(lines 172-181, the unchanged remainder of the padding helper, collapsed)

     return padded_image
 
 
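The round-up above works by adding multiple - 1 before the integer division; a standalone check of the arithmetic (not part of app.py):

assert (1000 + 256 - 1) // 256 * 256 == 1024   # 1000 rounds up to the next multiple of 256
assert (1024 + 256 - 1) // 256 * 256 == 1024   # exact multiples are left unchanged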
+def center_crop_resize(image, size=(1024, 1024)):
+    # Get dimensions
+    width, height = image.size
+
+    # Determine the center crop box
+    if width > height:
+        new_width = height
+        new_height = height
+        left = (width - new_width) / 2
+        top = 0
+        right = (width + new_width) / 2
+        bottom = height
+    else:
+        new_width = width
+        new_height = width
+        left = 0
+        top = (height - new_height) / 2
+        right = width
+        bottom = (height + new_height) / 2
+
+    # Crop the image to a square
+    image = image.crop((left, top, right, bottom))
+
+    # Resize the cropped image to the target size
+    image = image.resize(size, Image.LANCZOS)
+
+    return image
+
+
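As a usage sketch (not part of app.py; assumes the function above), a 1280x720 input is cropped to its central 720x720 square and then resized:

img = Image.new("RGB", (1280, 720))      # stand-in for a real photo
out = center_crop_resize(img)            # crop box works out to (280, 0, 1000, 720)
print(out.size)                          # (1024, 1024)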
+def visualize_anns(orig_image: np.ndarray, anns):
+    if len(anns) == 0:
+        return orig_image
+    sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
+
+    kernel = torch.ones(1, 1, 5, 5, dtype=torch.float32)
+
+    # RGBA
+    mask = np.ones((sorted_anns[0]['segmentation'].shape[0], sorted_anns[0]['segmentation'].shape[1], 4), dtype=np.float32)
+    mask[:, :, 3] = 0
+    for ann in sorted_anns:
+        m = ann['segmentation']
+        color_mask = np.concatenate([np.random.random(3), [0.35]])
+
+        tm = torch.as_tensor(m).reshape(1, 1, *m.shape).float()
+        cvtm = F.conv2d(tm, kernel, padding=2)
+
+        border_mask = (cvtm < 25).flatten(0, 2).numpy()
+
+        mask[m] = color_mask
+        mask[m & border_mask, 3] *= 1.0 / 0.35
+
+    color, alpha = mask[..., :3], mask[..., -1:]
+
+    orig_image = orig_image.astype(np.float32) / 255
+    overlay = alpha * color + (1 - alpha) * orig_image
+
+    overlay = (overlay * 255).astype(np.uint8)
+    return overlay
+
+
+
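The 5x5 box filter above flags a mask pixel as "border" when its neighbourhood is not entirely inside the mask (sum < 25); those pixels keep full opacity, giving each region a visible outline. A toy check of that test (standalone, not part of app.py):

m = np.zeros((12, 12), dtype=bool)
m[2:11, 2:11] = True                                    # 9x9 square mask
tm = torch.as_tensor(m).reshape(1, 1, 12, 12).float()
border = (F.conv2d(tm, torch.ones(1, 1, 5, 5), padding=2) < 25).flatten(0, 2).numpy()
print((m & border).sum(), (m & ~border).sum())          # 56 edge pixels, 25 interior pixels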
 @spaces.GPU
 def infer_radio(image):
     """Define the function to generate the output."""
     model.cuda()
+    conditioner.cuda()
+    sam.cuda()
+    sam_generator = SamAutomaticMaskGenerator(sam, output_mode="binary_mask")
 
+    # PCA feature visualization
+    padded_image = pad_image_to_multiple_of(image, multiple=256)
+    width, height = padded_image.size
+    pixel_values = image_processor(images=padded_image, return_tensors='pt').pixel_values
+    pixel_values = pixel_values.to(torch.bfloat16).cuda()
+    pixel_values = conditioner(pixel_values)
 
+    _, features = model(pixel_values)["backbone"]
 
     num_rows = height // model.patch_size
     num_cols = width // model.patch_size

(lines 265-266 unchanged, collapsed)

     features = rearrange(features, 'b (h w) c -> b h w c', h=num_rows, w=num_cols).float()
 
     pca_viz = get_pca_map(features, (height, width), interpolation='bilinear')
+
+    # SAM mask visualization
+    resized_image = center_crop_resize(image)
+    image_array = np.array(image)
+    print("image size", image_array.shape)
+    #image_array = np.transpose(image_array, (2, 0, 1))
+    masks = sam_generator.generate(image_array)
+    overlay = visualize_anns(image_array, masks)
 
+    return f"{features.shape}", pca_viz, overlay
+
+
+
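Called directly (outside Gradio), the function maps one PIL image to the three outputs declared below; for example (requires a GPU; the file name is a placeholder):

shape_str, pca_img, sam_overlay = infer_radio(Image.open("samples/example.jpg").convert("RGB"))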
+title = """RADIO: Reduce All Domains Into One"""
+
+description = """
+# RADIO
+
+AM-RADIO is a framework for distilling large vision foundation models into a single one.
+RADIO, a new vision foundation model, excels across visual domains, serving as a superior replacement for vision backbones.
+Integrating CLIP variants, DINOv2, and SAM through distillation, it preserves unique features like text grounding and segmentation correspondence.
+It outperforms its teachers on ImageNet zero-shot (+6.8%), kNN (+2.39%), and linear-probe segmentation (+3.8%), improves vision-language models (LLaVA 1.5 by up to 1.5%), and scales to any resolution, including non-square images.
+
+# Instructions
+
+Simply paste an image or pick one from the gallery of examples, then click the "Submit" button.
+"""
+
+inputs = [
+    gr.Image(type="pil")
+]
 
+outputs = [
+    gr.Textbox(label="Feature Shape"),
+    gr.Image(label="PCA Feature Visualization"),
+    gr.Image(label="SAM Masks"),
+]
 
 # Create the Gradio interface
 demo = gr.Interface(
     fn=infer_radio,
     inputs=inputs,
+    examples="./samples/",
     outputs=outputs,
     title=title,
     description=description,

(lines 316-317 unchanged, collapsed)

 
 if __name__ == "__main__":
     demo.launch()