Kevin Sun committed
Commit 6cd90b7 · 1 Parent(s): 8927fea

init commit
CLIP_as_RNN ADDED
@@ -0,0 +1 @@
+ Subproject commit 2457b49b339498af726408aa6673155de408c0f0
README.md CHANGED
@@ -1,13 +1,14 @@
- ---
- title: CLIP As RNN
- emoji: 🏢
- colorFrom: purple
- colorTo: purple
- sdk: gradio
- sdk_version: 4.29.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # CLIP as RNN: Segment Countless Visual Concepts without Training Endeavor (CaR)
+
+ This repo holds the implementation code of the paper [CLIP as RNN: Segment Countless Visual Concepts without Training Endeavor (CaR)](https://arxiv.org/abs/2312.07661) by Shuyang Sun, Runjia Li, Philip Torr, Xiuye Gu, and Siyang Li:
+
+ ```
+ @article{clip_as_rnn,
+   title={CLIP as RNN: Segment Countless Visual Concepts without Training Endeavor},
+   author={Shuyang Sun and Runjia Li and Philip Torr and Xiuye Gu and Siyang Li},
+   year={2023},
+   eprint={2312.07661},
+   archivePrefix={arXiv},
+   primaryClass={cs.CV}
+ }
+ ```
app.py ADDED
@@ -0,0 +1,285 @@
"""Run a Gradio demo of the CaR model on a single image."""

import argparse
import colorsys
import random

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image as Image
import torch

from modeling.model import CaR
from modeling.post_process.post_process import match_masks, generate_masks_from_sam
from sam.sam import SAMPipeline
from sam.utils import build_sam_config
from utils.utils import Config, load_yaml

# Set random seeds for reproducible colors and model behavior.
random.seed(15)
np.random.seed(0)
torch.manual_seed(0)

CFG_PATH = "configs/demo/pokemon.yaml"


def generate_distinct_colors(n):
    """Generate n visually distinct RGB colors by spacing hues evenly."""
    colors = []
    # Random hue offset so the palette differs between runs.
    random_color_bias = random.random()
    for i in range(n):
        hue = (float(i) / n + random_color_bias) % 1.0
        rgb = colorsys.hsv_to_rgb(hue, 1.0, 1.0)
        # Convert RGB values from the [0, 1] range to [0, 255].
        colors.append(tuple(int(val * 255) for val in rgb))
    return colors


def overlap_masks(masks):
    """Resolve overlaps between per-class masks for visualization.

    Parameters:
    - masks: tensor of shape (N, H, W) with one binary mask per class.

    Returns:
    - clean_masks: tensor of shape (N, H, W) whose masks no longer overlap
      (later classes take precedence over earlier ones).
    """
    overlap_mask = torch.zeros_like(masks[0])
    for mask_idx, mask in enumerate(masks):
        overlap_mask[mask > 0] = mask_idx + 1

    clean_masks = [overlap_mask == mask_idx + 1 for mask_idx in range(len(masks))]
    clean_masks = torch.stack(clean_masks, dim=0)
    return clean_masks


def visualize_segmentation(image, masks, class_names, alpha=0.7,
                           y_list=None, x_list=None):
    """Visualize segmentation masks on an image.

    Parameters:
    - image: np.array of shape (H, W, 3) representing the RGB image.
    - masks: np.array of shape (N, H, W) with one binary mask per class.
    - class_names: list of strings with the name of each class.
    - alpha: float, transparency level of the masks over the image.
    - y_list, x_list: optional label positions; defaults to mask centroids.

    Returns:
    - fig: plt.figure object with the overlaid masks and class labels.
    """
    fig, ax = plt.subplots(1, figsize=(12, 9))
    # Accumulate a colored mask and the binary union of all masks.
    final_mask = np.zeros((masks.shape[1], masks.shape[2], 3), dtype=np.float32)
    binary_final_mask = np.zeros((masks.shape[1], masks.shape[2]), dtype=np.float32)
    colors = generate_distinct_colors(len(class_names))
    idx = 0
    for mask, color, class_name in zip(masks, colors, class_names):
        # Overlay the mask in its class color.
        final_mask += np.dstack([mask * c for c in color])
        binary_final_mask += mask
        # Find a representative point (e.g., the centroid) for placing the label.
        if y_list is None or x_list is None:
            y, x = np.argwhere(mask).mean(axis=0)
        else:
            y, x = y_list[idx], x_list[idx]
        ax.text(x, y, class_name, color='white',
                fontsize=22, va='center', ha='center',
                bbox=dict(facecolor='black', alpha=0.7, edgecolor='none'))
        idx += 1

    # Dim the masked regions of the image and blend in the colored masks.
    image[binary_final_mask > 0] = image[binary_final_mask > 0] * (1 - alpha)
    final_image = (image + final_mask * alpha).astype(np.uint8)
    ax.imshow(final_image)
    # Remove axis ticks and labels.
    ax.axis('off')
    return fig


def get_sam_masks(cfg, masks, image_path=None, img_sam=None, pipeline=None):
    """Refine CaR masks by matching them against SAM mask proposals."""
    print("generating sam masks online")
    if img_sam is None and image_path is None:
        raise ValueError(
            'Please provide either the image path or the image numpy array.')

    mask_tensor, mask_list = generate_masks_from_sam(
        image_path,
        save_path='./',
        pipeline=pipeline,
        img_sam=img_sam,
        visualize=False,
    )
    mask_tensor = mask_tensor.to(masks.device)

    # Only match SAM proposals against masks that are not all zero.
    attn_map, mask_ids = [], []
    for mask_id, mask in enumerate(masks):
        if torch.sum(mask) > 0:
            attn_map.append(mask.unsqueeze(0))
            mask_ids.append(mask_id)
    matched_masks = [
        match_masks(
            mask_tensor,
            attn,
            mask_list,
            iom_thres=cfg.car.iom_thres,
            min_pred_threshold=cfg.sam.min_pred_threshold)
        for attn in attn_map]
    # Replace each non-empty CaR mask with the union of its matched SAM masks.
    for matched_mask, mask_id in zip(matched_masks, mask_ids):
        sam_masks = np.array([item['segmentation'] for item in matched_mask])
        sam_mask = np.any(sam_masks, axis=0)
        masks[mask_id] = torch.from_numpy(sam_mask).to(masks.device)
    return masks


def load_sam(cfg, device):
    """Build the SAM pipeline from the checkpoint and thresholds in cfg."""
    sam_checkpoint, model_type = build_sam_config(cfg)
    pipeline = SAMPipeline(
        sam_checkpoint,
        model_type,
        device=device,
        points_per_side=cfg.sam.points_per_side,
        pred_iou_thresh=cfg.sam.pred_iou_thresh,
        stability_score_thresh=cfg.sam.stability_score_thresh,
        box_nms_thresh=cfg.sam.box_nms_thresh,
    )
    return pipeline


def generate(img, class_names, clip_thresh, mask_thresh, confidence_thresh,
             post_process, stability_score_thresh, box_nms_thresh,
             iom_thres, min_pred_threshold):
    """Gradio callback: segment the uploaded image with the given class names."""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    cfg = Config(**load_yaml(CFG_PATH))
    # Override the config with the values chosen in the UI.
    cfg.car.clipes_threshold = clip_thresh
    cfg.car.mask_threshold = mask_thresh
    cfg.car.confidence_threshold = confidence_thresh
    cfg.sam.stability_score_thresh = stability_score_thresh
    cfg.sam.box_nms_thresh = box_nms_thresh
    cfg.car.iom_thres = iom_thres
    cfg.sam.min_pred_threshold = min_pred_threshold
    car_model = CaR(cfg, visualize=True, seg_mode='semantic', device=device)

    # Downsample the image by 2x if its width exceeds 1000 pixels.
    if img.size[0] > 1000:
        img = img.resize((img.size[0] // 2, img.size[1] // 2))

    y_list, x_list = None, None
    class_names = class_names.split(',')
    sentences = class_names

    pseudo_masks, _, _ = car_model(img, sentences, 1)

    if post_process == 'SAM':
        pipeline = load_sam(cfg, device)
        pseudo_masks = get_sam_masks(
            cfg,
            pseudo_masks,
            image_path=None,
            img_sam=np.array(img),
            pipeline=pipeline)
    pseudo_masks = overlap_masks(pseudo_masks)

    # Visualize the segmentation masks.
    demo_fig = visualize_segmentation(np.array(img),
                                      pseudo_masks.detach().cpu().numpy(),
                                      class_names,
                                      y_list=y_list,
                                      x_list=x_list)

    # Convert the demo figure to a PIL image.
    demo_fig.canvas.draw()
    demo_img = np.array(demo_fig.canvas.renderer._renderer)
    demo_img = Image.fromarray(demo_img)
    return demo_img


if __name__ == "__main__":
    parser = argparse.ArgumentParser('car')
    parser.add_argument("--cfg-path",
                        default='configs/local_car.yaml',
                        help="path to configuration file.")
    args = parser.parse_args()

    demo = gr.Interface(
        generate,
        inputs=[
            gr.Image(label="upload an image", type="pil"),
            "text",
            gr.Slider(label="clip thresh",
                      minimum=0, maximum=1, value=0.4, step=0.1,
                      info="the threshold for clip-es adversarial heatmap clipping"),
            gr.Slider(label="mask thresh",
                      minimum=0, maximum=1, value=0.6, step=0.1,
                      info="the binarization threshold for the mask to generate the visual prompt"),
            gr.Slider(label="confidence thresh",
                      minimum=0, maximum=1, value=0, step=0.1,
                      info="the threshold for filtering the proposed classes"),
            gr.Radio(["CRF", "SAM"], label="post process", value="CRF",
                     info="choose the post process method"),
            gr.Slider(label="stability score thresh for SAM mask proposal \n(only when SAM is chosen for post process)",
                      minimum=0, maximum=1, value=0.95, step=0.1),
            gr.Slider(label="box nms thresh for SAM mask proposal \n(only when SAM is chosen for post process)",
                      minimum=0, maximum=1, value=0.7, step=0.1),
            gr.Slider(label="intersection over mask threshold for SAM mask proposal \n(only when SAM is chosen for post process)",
                      minimum=0, maximum=1, value=0.5, step=0.1),
            gr.Slider(label="minimum prediction threshold for SAM mask proposal \n(only when SAM is chosen for post process)",
                      minimum=0, maximum=1, value=0.03, step=0.01)],
        outputs="image",
        title="CLIP as RNN: Segment Countless Visual Concepts without Training Endeavor",
        description=("This is the official demo for CLIP as RNN. Please upload an image and "
                     "type in the class names (separated by ',', e.g. cat,dog,human) you want "
                     "to segment. The model will generate the segmentation masks for the input "
                     "image. You can also adjust the clip thresh, mask thresh and confidence "
                     "thresh to get better results."),
        examples=[
            ["demo/pokemon1.jpg", "Charmander,Bulbasaur,Squirtle", 0.6, 0.6, 0, "SAM", 0.95, 0.7, 0.6, 0.01],
            ["demo/batman.jpg", "Batman,Joker,Cat Woman", 0.6, 0.6, 0, "SAM", 0.95, 0.7, 0.6, 0.01],
            ["demo/avengers1.jpg", "Thor,Captain America,Hulk,Iron Man", 0.6, 0.6, 0, "SAM", 0.89, 0.65, 0.5, 0.03],
        ])
    demo.launch(share=True)
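For reference, here is a minimal sketch of driving the same pipeline without the Gradio UI, assuming the repository's `CaR`, `Config`, and `load_yaml` interfaces behave exactly as used in `app.py` above; the config path and image file are the demo assets referenced there, and the printed shape is only an expectation, not a guaranteed API.

```python
# Minimal sketch: run CaR on one image outside Gradio.
# Assumes the same interfaces app.py uses; paths below are the demo assets.
import torch
import PIL.Image as Image

from modeling.model import CaR
from utils.utils import Config, load_yaml

cfg = Config(**load_yaml("configs/demo/pokemon.yaml"))
device = "cuda" if torch.cuda.is_available() else "cpu"
car_model = CaR(cfg, visualize=True, seg_mode="semantic", device=device)

img = Image.open("demo/pokemon1.jpg").convert("RGB")
class_names = ["Charmander", "Bulbasaur", "Squirtle"]

# One recurrent pass (the trailing `1` mirrors the call in app.py).
pseudo_masks, _, _ = car_model(img, class_names, 1)
print(pseudo_masks.shape)  # expected: one binary mask per class name
```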
configs/ade_150.yaml ADDED
@@ -0,0 +1,55 @@
1
+ clip:
2
+ semantic_clip_model_name: 'ViT-L/14'
3
+ semantic_pretrained_data: 'openai'
4
+ clip_model_name: "ViT-B/16"
5
+ pretrained_data: 'openai'
6
+
7
+ car:
8
+ iom_thres: 0.6
9
+ mask_threshold: 0.6
10
+ min_area_ratio: 0.2
11
+ num_iteration: 1
12
+ confidence_threshold: 0.25
13
+ clipes_threshold: 0.7
14
+ bg_factor: 1
15
+ stuff_bg_factor: 1
16
+ visual_prompt_type: ['gray', 'blur']
17
+ stuff_visual_prompt_type: ['gray', 'blur']
18
+ semantic_templates: ['a clean origami {}.',
19
+ 'a photo of a {}.',
20
+ 'This is a photo of a {}',
21
+ 'There is a {} in the scene',
22
+ 'There is the {} in the scene',
23
+ 'a photo of a {} in the scene',
24
+ 'a photo of a small {}.',
25
+ 'a photo of a medium {}.',
26
+ 'a photo of a large {}.',
27
+ 'This is a photo of a small {}.',
28
+ 'This is a photo of a medium {}.',
29
+ 'This is a photo of a large {}.',
30
+ 'There is a small {} in the scene.',
31
+ 'There is a medium {} in the scene.',
32
+ 'There is a large {} in the scene.']
33
+
34
+ bg_cls: ['ground', 'land', 'grass', 'tree', 'building',
35
+ 'wall', 'sky', 'lake', 'water', 'river', 'sea',
36
+ 'railway', 'railroad', 'helmet', 'cloud', 'house',
37
+ 'mountain', 'ocean', 'road', 'rock', 'street',
38
+ 'valley', 'bridge']
39
+
40
+ test:
41
+ algo: "car"
42
+ ds_name: "ade"
43
+ seg_mode: "semantic"
44
+ split: 'validation'
45
+ data_root: "$YOUR_ADE_DATA_DIR"
46
+ # You need to extract the sam mask for the ADE dataset if use_pseudo=False
47
+ sam_mask_root: "$YOUR_SAM_MASK_DIR"
48
+ output_path: "./outputs/"
49
+ use_pseudo: True
50
+ n_class: 151
51
+ num_chunks: 1
52
+ chunk_index: 0
53
+ ignore_background: True
54
+
55
+ save_path: "./outputs"
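The `semantic_templates` entries above are prompt templates with a `{}` slot for the class name. As a small illustration of how templates like these are typically expanded into per-class text prompts (plain `str.format`; the repo's own prompt-building helper may differ in details):

```python
# Sketch: expand prompt templates like the ones in the config for one class.
templates = [
    "a clean origami {}.",
    "a photo of a {}.",
    "There is a {} in the scene",
]

def expand_prompts(class_name, templates):
    """Return one text prompt per template for a single class name."""
    return [t.format(class_name) for t in templates]

print(expand_prompts("tree", templates))
# ['a clean origami tree.', 'a photo of a tree.', 'There is a tree in the scene']
```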
configs/ade_847.yaml ADDED
@@ -0,0 +1,55 @@
1
+ clip:
2
+ semantic_clip_model_name: 'ViT-L/14'
3
+ semantic_pretrained_data: 'openai'
4
+ clip_model_name: "ViT-B/16"
5
+ pretrained_data: 'openai'
6
+
7
+ car:
8
+ iom_thres: 0.6
9
+ mask_threshold: 0.6
10
+ min_area_ratio: 0.2
11
+ num_iteration: 1
12
+ confidence_threshold: 0.25
13
+ clipes_threshold: 0.7
14
+ bg_factor: 1
15
+ stuff_bg_factor: 1
16
+ visual_prompt_type: ['gray', 'blur']
17
+ stuff_visual_prompt_type: ['gray', 'blur']
18
+ semantic_templates: ['a clean origami {}.',
19
+ 'a photo of a {}.',
20
+ 'This is a photo of a {}',
21
+ 'There is a {} in the scene',
22
+ 'There is the {} in the scene',
23
+ 'a photo of a {} in the scene',
24
+ 'a photo of a small {}.',
25
+ 'a photo of a medium {}.',
26
+ 'a photo of a large {}.',
27
+ 'This is a photo of a small {}.',
28
+ 'This is a photo of a medium {}.',
29
+ 'This is a photo of a large {}.',
30
+ 'There is a small {} in the scene.',
31
+ 'There is a medium {} in the scene.',
32
+ 'There is a large {} in the scene.']
33
+
34
+ bg_cls: ['ground', 'land', 'grass', 'tree', 'building',
35
+ 'wall', 'sky', 'lake', 'water', 'river', 'sea',
36
+ 'railway', 'railroad', 'helmet', 'cloud', 'house',
37
+ 'mountain', 'ocean', 'road', 'rock', 'street',
38
+ 'valley', 'bridge']
39
+
40
+ test:
41
+ algo: "car"
42
+ ds_name: "ade_847"
43
+ seg_mode: "semantic"
44
+ split: 'validation'
45
+ data_root: "$YOUR_ADE_DATA_DIR"
46
+ # You need to extract the sam mask for the ADE dataset if use_pseudo=False
47
+ sam_mask_root: "$YOUR_SAM_MASK_DIR"
48
+ output_path: "./outputs/"
49
+ use_pseudo: True
50
+ n_class: 847
51
+ num_chunks: 1
52
+ chunk_index: 0
53
+ ignore_background: True
54
+
55
+ save_path: "./outputs"
configs/coco.yaml ADDED
@@ -0,0 +1,51 @@
1
+ clip:
2
+ semantic_clip_model_name: 'ViT-L/14'
3
+ semantic_pretrained_data: 'openai'
4
+ clip_model_name: "ViT-B/16"
5
+ pretrained_data: 'openai'
6
+
7
+
8
+ car:
9
+ iom_thres: 0.7
10
+ mask_threshold: 0.5
11
+ min_area_ratio: 0.2
12
+ num_iteration: 1
13
+ confidence_threshold: 0.3
14
+ clipes_threshold: 0.5
15
+ visual_prompt_type: ['blur', 'gray']
16
+ semantic_templates: ['a clean origami {}.',
17
+ 'a photo of a {}.',
18
+ 'This is a photo of a {}',
19
+ 'There is a {} in the scene',
20
+ 'There is the {} in the scene',
21
+ 'a photo of a {} in the scene',
22
+ 'a photo of a small {}.',
23
+ 'a photo of a medium {}.',
24
+ 'a photo of a large {}.',
25
+ 'This is a photo of a small {}.',
26
+ 'This is a photo of a medium {}.',
27
+ 'This is a photo of a large {}.',
28
+ 'There is a small {} in the scene.',
29
+ 'There is a medium {} in the scene.',
30
+ 'There is a large {} in the scene.']
31
+ bg_cls: ['ground', 'land', 'grass', 'tree', 'building',
32
+ 'wall', 'sky', 'lake', 'water', 'river', 'sea',
33
+ 'railway', 'railroad', 'helmet', 'cloud', 'house',
34
+ 'mountain', 'ocean', 'road', 'rock', 'street',
35
+ 'valley', 'bridge']
36
+
37
+ test:
38
+ algo: "car"
39
+ ds_name: "coco"
40
+ seg_mode: "semantic"
41
+ data_root: "$YOUR_DATA_DIR"
42
+ # You need to extract the SAM masks for this dataset if use_pseudo=False
43
+ sam_mask_root: "$YOUR_SAM_MASK_DIR"
44
+ output_path: "./outputs/"
45
+ use_pseudo: True
46
+ split: "val"
47
+ n_class: 81
48
+ num_chunks: 1
49
+ chunk_index: 0
50
+
51
+ save_path: "./outputs"
configs/gres.yaml ADDED
@@ -0,0 +1,38 @@
1
+ clip:
2
+ semantic_clip_model_name: 'ViT-L/14'
3
+ semantic_pretrained_data: 'openai'
4
+ clip_model_name: "ViT-B/16"
5
+ pretrained_data: 'openai'
6
+
7
+ car:
8
+ iom_thres: 0.5
9
+ mask_threshold: 0.5
10
+ confidence_threshold: 0
11
+ clipes_threshold: 0.3
12
+ cam_text_template: 'a clean origami {}.'
13
+ color: [255, 0, 0] # red
14
+ visual_prompt_type: ['circle']
15
+ bg_cls: ['ground', 'land', 'grass', 'tree', 'building',
16
+ 'wall', 'sky', 'lake', 'water', 'river', 'sea',
17
+ 'railway', 'railroad', 'helmet', 'cloud', 'house',
18
+ 'mountain', 'ocean', 'road', 'rock', 'street',
19
+ 'valley', 'bridge']
20
+
21
+
22
+ test:
23
+ algo: "car"
24
+ ds_name: "gres"
25
+ split: 'val'
26
+ seg_mode: "refer"
27
+ data_root: "$YOUR_ADE_DATA_DIR"
28
+ output_path: "./outputs/"
29
+ prompts_augment: False
30
+ use_pseudo: True
31
+ use_background: False
32
+ prompts_prefix: False
33
+ prompts_augment: False
34
+
35
+ sentence_process:
36
+ mixing_alpha: 0.
37
+
38
+ save_path: "./outputs"
configs/pascal_context.yaml ADDED
@@ -0,0 +1,60 @@
1
+ clip:
2
+ semantic_clip_model_name: 'ViT-L/14'
3
+ semantic_pretrained_data: 'openai'
4
+ clip_model_name: "ViT-B/16"
5
+ pretrained_data: 'openai'
6
+
7
+
8
+ car:
9
+ iom_thres: 0.5
10
+ mask_threshold: 0.6
11
+ stuff_mask_threshold: 0.6
12
+ min_area_ratio: 0.2
13
+ num_iteration: 1
14
+ confidence_threshold: 0.25
15
+ clipes_threshold: 0.4
16
+ bg_factor: 1
17
+ stuff_bg_factor: 1
18
+ has_pamr: False
19
+ visual_prompt_type: ['blur', 'circle']
20
+ stuff_visual_prompt_type: ['blur', 'gray']
21
+ semantic_templates: ['a clean origami {}.',
22
+ 'a photo of a {}.',
23
+ 'This is a photo of a {}',
24
+ 'There is a {} in the scene',
25
+ 'There is the {} in the scene',
26
+ 'a photo of a {} in the scene',
27
+ 'a photo of a small {}.',
28
+ 'a photo of a medium {}.',
29
+ 'a photo of a large {}.',
30
+ 'This is a photo of a small {}.',
31
+ 'This is a photo of a medium {}.',
32
+ 'This is a photo of a large {}.',
33
+ 'There is a small {} in the scene.',
34
+ 'There is a medium {} in the scene.',
35
+ 'There is a large {} in the scene.']
36
+
37
+
38
+
39
+ bg_cls: ['ground', 'land', 'grass', 'tree', 'building',
40
+ 'wall', 'sky', 'lake', 'water', 'river', 'sea',
41
+ 'railway', 'railroad', 'helmet', 'cloud', 'house',
42
+ 'mountain', 'ocean', 'road', 'rock', 'street',
43
+ 'valley', 'bridge']
44
+
45
+
46
+ test:
47
+ algo: "car"
48
+ ds_name: "context"
49
+ seg_mode: "semantic"
50
+ n_class: 60
51
+ data_root: "$YOUR_DATA_DIR"
52
+ output_path: "./outputs/"
53
+ use_pseudo: True
54
+ split: "val"
55
+ num_chunks: 1
56
+ chunk_index: 0
57
+ ignore_background: False
58
+
59
+
60
+ save_path: "./outputs"
configs/pascal_context_459.yaml ADDED
@@ -0,0 +1,55 @@
1
+ clip:
2
+ semantic_clip_model_name: 'ViT-L/14'
3
+ semantic_pretrained_data: 'openai'
4
+ clip_model_name: "ViT-B/16"
5
+ pretrained_data: 'openai'
6
+
7
+ car:
8
+ iom_thres: 0.6
9
+ mask_threshold: 0.4
10
+ min_area_ratio: 0.2
11
+ num_iteration: 1
12
+ confidence_threshold: 0.25 # 0.2
13
+ clipes_threshold: 0.7
14
+ bg_factor: 1
15
+ stuff_bg_factor: 1
16
+ visual_prompt_type: ['gray', 'blur']
17
+ stuff_visual_prompt_type: ['gray', 'blur']
18
+ semantic_templates: ['a clean origami {}.',
19
+ 'a photo of a {}.',
20
+ 'This is a photo of a {}',
21
+ 'There is a {} in the scene',
22
+ 'There is the {} in the scene',
23
+ 'a photo of a {} in the scene',
24
+ 'a photo of a small {}.',
25
+ 'a photo of a medium {}.',
26
+ 'a photo of a large {}.',
27
+ 'This is a photo of a small {}.',
28
+ 'This is a photo of a medium {}.',
29
+ 'This is a photo of a large {}.',
30
+ 'There is a small {} in the scene.',
31
+ 'There is a medium {} in the scene.',
32
+ 'There is a large {} in the scene.']
33
+
34
+ bg_cls: ['ground', 'land', 'grass', 'tree', 'building',
35
+ 'wall', 'sky', 'lake', 'water', 'river', 'sea',
36
+ 'railway', 'railroad', 'helmet', 'cloud', 'house',
37
+ 'mountain', 'ocean', 'road', 'rock', 'street',
38
+ 'valley', 'bridge']
39
+
40
+ test:
41
+ algo: "car"
42
+ ds_name: "pascal_459"
43
+ seg_mode: "semantic"
44
+ split: 'validation'
45
+ data_root: "$YOUR_DATA_DIR"
46
+ # You need to extract the sam mask for the ADE dataset if use_pseudo=False
47
+ sam_mask_root: "$YOUR_SAM_MASK_DIR"
48
+ output_path: "./outputs/"
49
+ use_pseudo: True
50
+ n_class: 460
51
+ num_chunks: 1
52
+ chunk_index: 0
53
+ ignore_background: True
54
+
55
+ save_path: "./outputs"
configs/refcoco+.yaml ADDED
@@ -0,0 +1,34 @@
1
+ clip:
2
+ semantic_clip_model_name: 'ViT-B/16'
3
+ semantic_pretrained_data: 'openai'
4
+ clip_model_name: "ViT-B/16"
5
+ pretrained_data: 'openai'
6
+
7
+ car:
8
+ iom_thres: 0.5
9
+ mask_threshold: 0.2
10
+ confidence_threshold: 0.1
11
+ clipes_threshold: 0.5 # refcocog: 0.6
12
+ color: [255, 0, 0] # red
13
+ visual_prompt_type: ['circle', 'blur']
14
+ min_area_ratio: 0.2
15
+ bg_cls: ['ground', 'land', 'grass', 'tree', 'building',
16
+ 'wall', 'sky', 'lake', 'water', 'river', 'sea',
17
+ 'railway', 'railroad', 'helmet', 'cloud', 'house',
18
+ 'mountain', 'ocean', 'road', 'rock', 'street',
19
+ 'valley', 'bridge']
20
+
21
+ test:
22
+ algo: "car"
23
+ ds_name: "refcoco+"
24
+ seg_mode: "refer"
25
+ split: 'val'
26
+ data_root: "$YOUR_DATA_DIR"
27
+ output_path: "./outputs/"
28
+ prompts_augment: False
29
+ use_pseudo: True
30
+
31
+ sentence_process:
32
+ mixing_alpha: 0.
33
+
34
+ save_path: "./outputs"
configs/refcoco.yaml ADDED
@@ -0,0 +1,34 @@
1
+ clip:
2
+ semantic_clip_model_name: 'ViT-B/16'
3
+ semantic_pretrained_data: 'openai'
4
+ clip_model_name: "ViT-B/16"
5
+ pretrained_data: 'openai'
6
+
7
+ car:
8
+ iom_thres: 0.5
9
+ mask_threshold: 0.5
10
+ confidence_threshold: 0.3
11
+ clipes_threshold: 0.5
12
+ color: [255, 0, 0] # red
13
+ visual_prompt_type: ['circle']
14
+ min_area_ratio: 0.2
15
+ bg_cls: ['ground', 'land', 'grass', 'tree', 'building',
16
+ 'wall', 'sky', 'lake', 'water', 'river', 'sea',
17
+ 'railway', 'railroad', 'helmet', 'cloud', 'house',
18
+ 'mountain', 'ocean', 'road', 'rock', 'street',
19
+ 'valley', 'bridge']
20
+
21
+ test:
22
+ algo: "car"
23
+ ds_name: "refcoco"
24
+ seg_mode: "refer"
25
+ split: 'val'
26
+ data_root: "$YOUR_DATA_DIR"
27
+ output_path: "./outputs/"
28
+ prompts_augment: False
29
+ use_pseudo: True
30
+
31
+ sentence_process:
32
+ mixing_alpha: 0.
33
+
34
+ save_path: "./outputs"
configs/refcocog.yaml ADDED
@@ -0,0 +1,37 @@
1
+ clip:
2
+ semantic_clip_model_name: 'ViT-B/16'
3
+ semantic_pretrained_data: 'openai'
4
+ clip_model_name: "ViT-B/16"
5
+ pretrained_data: 'openai'
6
+
7
+ car:
8
+ iom_thres: 0.5
9
+ mask_threshold: 0.5
10
+ confidence_threshold: 0.1
11
+ clipes_threshold: 0.6
12
+ color: [255, 0, 0] # red
13
+ visual_prompt_type: ['circle', 'blur']
14
+ min_area_ratio: 0.2
15
+ bg_cls: ['ground', 'land', 'grass', 'tree', 'building',
16
+ 'wall', 'sky', 'lake', 'water', 'river', 'sea',
17
+ 'railway', 'railroad', 'helmet', 'cloud', 'house',
18
+ 'mountain', 'ocean', 'road', 'rock', 'street',
19
+ 'valley', 'bridge']
20
+
21
+ test:
22
+ algo: "car"
23
+ ds_name: "refcocog"
24
+ seg_mode: "refer"
25
+ splitby: 'umd'
26
+ split: 'val'
27
+ data_root: "$YOUR_DATA_DIR"
28
+ output_path: "./outputs/"
29
+ prompts_augment: False
30
+ use_pseudo: True
31
+
32
+ sentence_process:
33
+ mixing_alpha: 0.
34
+
35
+ save_path: "./outputs"
36
+
37
+
configs/voc.yaml ADDED
@@ -0,0 +1,63 @@
1
+ clip:
2
+ semantic_clip_model_name: 'ViT-L/14'
3
+ semantic_pretrained_data: 'openai'
4
+ clip_model_name: "ViT-B/16"
5
+ pretrained_data: 'openai'
6
+
7
+ car:
8
+ iom_thres: 0.6
9
+ mask_threshold: 0.4
10
+ min_area_ratio: 0.2
11
+ confidence_threshold: 0.6 # 0.2
12
+ clipes_threshold: 0.4
13
+ visualize: False
14
+ visual_prompt_type: ['circle', 'blur']
15
+ semantic_templates: ['a clean origami {}.',
16
+ 'a photo of a {}.',
17
+ 'This is a photo of a {}',
18
+ 'There is a {} in the scene',
19
+ 'There is the {} in the scene',
20
+ 'a photo of a {} in the scene',
21
+ 'a photo of a small {}.',
22
+ 'a photo of a medium {}.',
23
+ 'a photo of a large {}.',
24
+ 'This is a photo of a small {}.',
25
+ 'This is a photo of a medium {}.',
26
+ 'This is a photo of a large {}.',
27
+ 'There is a small {} in the scene.',
28
+ 'There is a medium {} in the scene.',
29
+ 'There is a large {} in the scene.']
30
+
31
+ bg_cls: ['ground', 'land', 'grass', 'tree', 'building',
32
+ 'wall', 'sky', 'lake', 'water', 'river', 'sea',
33
+ 'railway', 'railroad', 'helmet', 'cloud', 'house',
34
+ 'mountain', 'ocean', 'road', 'rock', 'street',
35
+ 'valley', 'bridge']
36
+
37
+ # SAM is activated only if test.use_pseudo is False
38
+ sam:
39
+ model_dir: "$YOUR_SAM_MODEL_DIR"
40
+ sam_checkpoint: "$YOUR_SAM_MODEL_DIR/sam_hq_vit_h.pth"
41
+ model_type: "vit_h"
42
+ min_pred_threshold: 0.05
43
+ points_per_side:
44
+ pred_iou_thresh: 0.88
45
+ stability_score_thresh: 0.95
46
+ box_nms_thresh: 0.7
47
+
48
+ test:
49
+ algo: "car"
50
+ ds_name: "voc"
51
+ seg_mode: "semantic"
52
+ split: 'val'
53
+ data_root: "$YOUR_DATA_DIR"
54
+ # You need to extract the sam mask for the ADE dataset if use_pseudo=False
55
+ sam_mask_root: "$YOUR_SAM_MASK_DIR"
56
+ output_path: "./outputs/"
57
+ use_pseudo: True
58
+ n_class: 21
59
+ num_chunks: 1
60
+ chunk_index: 0
61
+ ignore_background: False
62
+
63
+ save_path: "./outputs"
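`app.py` loads YAML files like the ones above with `Config(**load_yaml(path))` and then overrides individual fields from the UI sliders. A short sketch of that pattern, assuming the same `Config`/`load_yaml` utilities (field names taken from the configs above):

```python
# Sketch: load a config such as configs/voc.yaml and override a few fields,
# following the pattern used in app.py.
from utils.utils import Config, load_yaml

cfg = Config(**load_yaml("configs/voc.yaml"))

# Values set in the YAML are available as nested attributes...
print(cfg.car.mask_threshold, cfg.sam.pred_iou_thresh)

# ...and can be overridden at runtime, e.g. from UI controls or CLI flags.
cfg.car.mask_threshold = 0.5
cfg.sam.stability_score_thresh = 0.9
```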
data/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
data/ade.py ADDED
@@ -0,0 +1,544 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ADE20K dataset."""
17
+
18
+ import os
19
+ import numpy as np
20
+ from PIL import Image
21
+ import torch
22
+
23
+
24
+ ADE_CLASSES = [
25
+ 'wall',
26
+ 'building, edifice',
27
+ 'sky',
28
+ 'floor, flooring',
29
+ 'tree',
30
+ 'ceiling',
31
+ 'road, route',
32
+ 'bed',
33
+ 'windowpane, window',
34
+ 'grass',
35
+ 'cabinet',
36
+ 'sidewalk, pavement',
37
+ 'person, individual, someone, somebody, mortal, soul',
38
+ 'earth, ground',
39
+ 'door, double, door',
40
+ 'table',
41
+ 'mountain, mount',
42
+ 'plant, flora, plant, life',
43
+ 'curtain, drape, drapery, mantle, pall',
44
+ 'chair',
45
+ 'car, auto, automobile, machine, motorcar',
46
+ 'water',
47
+ 'painting, picture',
48
+ 'sofa, couch, lounge',
49
+ 'shelf',
50
+ 'house',
51
+ 'sea',
52
+ 'mirror',
53
+ 'rug, carpet, carpeting',
54
+ 'field',
55
+ 'armchair',
56
+ 'seat',
57
+ 'fence, fencing',
58
+ 'desk',
59
+ 'rock, stone',
60
+ 'wardrobe, closet, press',
61
+ 'lamp',
62
+ 'bathtub, bathing, tub, bath, tub',
63
+ 'railing, rail',
64
+ 'cushion',
65
+ 'base, pedestal, stand',
66
+ 'box',
67
+ 'column, pillar',
68
+ 'signboard, sign',
69
+ 'chest, of, drawers, chest, bureau, dresser',
70
+ 'counter',
71
+ 'sand',
72
+ 'sink',
73
+ 'skyscraper',
74
+ 'fireplace, hearth, open, fireplace',
75
+ 'refrigerator, icebox',
76
+ 'grandstand, covered, stand',
77
+ 'path',
78
+ 'stairs, steps',
79
+ 'runway',
80
+ 'case, display, case, showcase, vitrine',
81
+ 'pool, table, billiard, table, snooker, table',
82
+ 'pillow',
83
+ 'screen, door, screen',
84
+ 'stairway, staircase',
85
+ 'river',
86
+ 'bridge, span',
87
+ 'bookcase',
88
+ 'blind, screen',
89
+ 'coffee, table, cocktail, table',
90
+ 'toilet, can, commode, crapper, pot, potty, stool, throne',
91
+ 'flower',
92
+ 'book',
93
+ 'hill',
94
+ 'bench',
95
+ 'countertop',
96
+ 'stove, kitchen, stove, range, kitchen, range, cooking, stove',
97
+ 'palm, palm, tree',
98
+ 'kitchen, island',
99
+ (
100
+ 'computer, computing, machine, computing, device, data, processor,'
101
+ ' electronic, computer, information, processing, system'
102
+ ),
103
+ 'swivel, chair',
104
+ 'boat',
105
+ 'bar',
106
+ 'arcade, machine',
107
+ 'hovel, hut, hutch, shack, shanty',
108
+ (
109
+ 'bus, autobus, coach, charabanc, double-decker, jitney, motorbus,'
110
+ ' motorcoach, omnibus, passenger, vehicle'
111
+ ),
112
+ 'towel',
113
+ 'light, light, source',
114
+ 'truck, motortruck',
115
+ 'tower',
116
+ 'chandelier, pendant, pendent',
117
+ 'awning, sunshade, sunblind',
118
+ 'streetlight, street, lamp',
119
+ 'booth, cubicle, stall, kiosk',
120
+ (
121
+ 'television, television, receiver, television, set, tv, tv, set, idiot,'
122
+ ' box, boob, tube, telly, goggle, box'
123
+ ),
124
+ 'airplane, aeroplane, plane',
125
+ 'dirt, track',
126
+ 'apparel, wearing, apparel, dress, clothes',
127
+ 'pole',
128
+ 'land, ground, soil',
129
+ 'bannister, banister, balustrade, balusters, handrail',
130
+ 'escalator, moving, staircase, moving, stairway',
131
+ 'ottoman, pouf, pouffe, puff, hassock',
132
+ 'bottle',
133
+ 'buffet, counter, sideboard',
134
+ 'poster, posting, placard, notice, bill, card',
135
+ 'stage',
136
+ 'van',
137
+ 'ship',
138
+ 'fountain',
139
+ 'conveyer, belt, conveyor, belt, conveyer, conveyor, transporter',
140
+ 'canopy',
141
+ 'washer, automatic, washer, washing, machine',
142
+ 'plaything, toy',
143
+ 'swimming, pool, swimming, bath, natatorium',
144
+ 'stool',
145
+ 'barrel, cask',
146
+ 'basket, handbasket',
147
+ 'waterfall, falls',
148
+ 'tent, collapsible, shelter',
149
+ 'bag',
150
+ 'minibike, motorbike',
151
+ 'cradle',
152
+ 'oven',
153
+ 'ball',
154
+ 'food, solid, food',
155
+ 'step, stair',
156
+ 'tank, storage, tank',
157
+ 'trade, name, brand, name, brand, marque',
158
+ 'microwave, microwave, oven',
159
+ 'pot, flowerpot',
160
+ 'animal, animate, being, beast, brute, creature, fauna',
161
+ 'bicycle, bike, wheel, cycle',
162
+ 'lake',
163
+ 'dishwasher, dish, washer, dishwashing, machine',
164
+ 'screen, silver, screen, projection, screen',
165
+ 'blanket, cover',
166
+ 'sculpture',
167
+ 'hood, exhaust, hood',
168
+ 'sconce',
169
+ 'vase',
170
+ 'traffic, light, traffic, signal, stoplight',
171
+ 'tray',
172
+ (
173
+ 'ashcan, trash, can, garbage, can, wastebin, ash, bin, ash-bin, ashbin,'
174
+ ' dustbin, trash, barrel, trash, bin'
175
+ ),
176
+ 'fan',
177
+ 'pier, wharf, wharfage, dock',
178
+ 'crt, screen',
179
+ 'plate',
180
+ 'monitor, monitoring, device',
181
+ 'bulletin, board, notice, board',
182
+ 'shower',
183
+ 'radiator',
184
+ 'glass, drinking, glass',
185
+ 'clock',
186
+ 'flag',
187
+ ]
188
+
189
+
190
+ ADE_STUFF_CLASS = [
191
+ 'wall',
192
+ 'sky',
193
+ 'floor, flooring',
194
+ 'tree',
195
+ 'ceiling',
196
+ 'road, route',
197
+ 'grass',
198
+ 'earth, ground',
199
+ 'mountain, mount',
200
+ 'plant, flora, plant, life',
201
+ 'water',
202
+ 'sea',
203
+ 'field',
204
+ 'sand',
205
+ 'skyscraper',
206
+ 'path',
207
+ 'river',
208
+ 'bridge, span',
209
+ 'flower',
210
+ 'hill',
211
+ 'land, ground, soil',
212
+ 'dirt, track',
213
+ 'apparel, wearing, apparel, dress, clothes',
214
+ 'lake',
215
+ 'waterfall, falls',
216
+ ]
217
+
218
+ ADE_THING_CLASS = [
219
+ 'building, edifice',
220
+ 'bed',
221
+ 'windowpane, window',
222
+ 'cabinet',
223
+ 'sidewalk, pavement',
224
+ 'person, individual, someone, somebody, mortal, soul',
225
+ 'door, double, door',
226
+ 'table',
227
+ 'curtain, drape, drapery, mantle, pall',
228
+ 'chair',
229
+ 'car, auto, automobile, machine, motorcar',
230
+ 'painting, picture',
231
+ 'sofa, couch, lounge',
232
+ 'shelf',
233
+ 'house',
234
+ 'mirror',
235
+ 'rug, carpet, carpeting',
236
+ 'armchair',
237
+ 'seat',
238
+ 'fence, fencing',
239
+ 'desk',
240
+ 'rock, stone',
241
+ 'wardrobe, closet, press',
242
+ 'lamp',
243
+ 'bathtub, bathing, tub, bath, tub',
244
+ 'railing, rail',
245
+ 'cushion',
246
+ 'base, pedestal, stand',
247
+ 'box',
248
+ 'column, pillar',
249
+ 'signboard, sign',
250
+ 'chest, of, drawers, chest, bureau, dresser',
251
+ 'counter',
252
+ 'sink',
253
+ 'fireplace, hearth, open, fireplace',
254
+ 'refrigerator, icebox',
255
+ 'grandstand, covered, stand',
256
+ 'stairs, steps',
257
+ 'runway',
258
+ 'case, display, case, showcase, vitrine',
259
+ 'pool, table, billiard, table, snooker, table',
260
+ 'pillow',
261
+ 'screen, door, screen',
262
+ 'stairway, staircase',
263
+ 'bookcase',
264
+ 'blind, screen',
265
+ 'coffee, table, cocktail, table',
266
+ 'toilet, can, commode, crapper, pot, potty, stool, throne',
267
+ 'book',
268
+ 'bench',
269
+ 'countertop',
270
+ 'stove, kitchen, stove, range, kitchen, range, cooking, stove',
271
+ 'palm, palm, tree',
272
+ 'kitchen, island',
273
+ (
274
+ 'computer, computing, machine, computing, device, data, processor,'
275
+ ' electronic, computer, information, processing, system'
276
+ ),
277
+ 'swivel, chair',
278
+ 'boat',
279
+ 'bar',
280
+ 'arcade, machine',
281
+ 'hovel, hut, hutch, shack, shanty',
282
+ (
283
+ 'bus, autobus, coach, charabanc, double-decker, jitney, motorbus,'
284
+ ' motorcoach, omnibus, passenger, vehicle'
285
+ ),
286
+ 'towel',
287
+ 'light, light, source',
288
+ 'truck, motortruck',
289
+ 'tower',
290
+ 'chandelier, pendant, pendent',
291
+ 'awning, sunshade, sunblind',
292
+ 'streetlight, street, lamp',
293
+ 'booth, cubicle, stall, kiosk',
294
+ (
295
+ 'television, television, receiver, television, set, tv, tv, set, idiot,'
296
+ ' box, boob, tube, telly, goggle, box'
297
+ ),
298
+ 'airplane, aeroplane, plane',
299
+ 'pole',
300
+ 'bannister, banister, balustrade, balusters, handrail',
301
+ 'escalator, moving, staircase, moving, stairway',
302
+ 'ottoman, pouf, pouffe, puff, hassock',
303
+ 'bottle',
304
+ 'buffet, counter, sideboard',
305
+ 'poster, posting, placard, notice, bill, card',
306
+ 'stage',
307
+ 'van',
308
+ 'ship',
309
+ 'fountain',
310
+ 'conveyer, belt, conveyor, belt, conveyer, conveyor, transporter',
311
+ 'canopy',
312
+ 'washer, automatic, washer, washing, machine',
313
+ 'plaything, toy',
314
+ 'swimming, pool, swimming, bath, natatorium',
315
+ 'stool',
316
+ 'barrel, cask',
317
+ 'basket, handbasket',
318
+ 'tent, collapsible, shelter',
319
+ 'bag',
320
+ 'minibike, motorbike',
321
+ 'cradle',
322
+ 'oven',
323
+ 'ball',
324
+ 'food, solid, food',
325
+ 'step, stair',
326
+ 'tank, storage, tank',
327
+ 'trade, name, brand, name, brand, marque',
328
+ 'microwave, microwave, oven',
329
+ 'pot, flowerpot',
330
+ 'animal, animate, being, beast, brute, creature, fauna',
331
+ 'bicycle, bike, wheel, cycle',
332
+ 'dishwasher, dish, washer, dishwashing, machine',
333
+ 'screen, silver, screen, projection, screen',
334
+ 'blanket, cover',
335
+ 'sculpture',
336
+ 'hood, exhaust, hood',
337
+ 'sconce',
338
+ 'vase',
339
+ 'traffic, light, traffic, signal, stoplight',
340
+ 'tray',
341
+ (
342
+ 'ashcan, trash, can, garbage, can, wastebin, ash, bin, ash-bin, ashbin,'
343
+ ' dustbin, trash, barrel, trash, bin'
344
+ ),
345
+ 'fan',
346
+ 'pier, wharf, wharfage, dock',
347
+ 'crt, screen',
348
+ 'plate',
349
+ 'monitor, monitoring, device',
350
+ 'bulletin, board, notice, board',
351
+ 'shower',
352
+ 'radiator',
353
+ 'glass, drinking, glass',
354
+ 'clock',
355
+ 'flag',
356
+ ]
357
+
358
+
359
+ ADE_STUFF_CLASS_ID = [
360
+ 0,
361
+ 2,
362
+ 3,
363
+ 4,
364
+ 5,
365
+ 6,
366
+ 9,
367
+ 13,
368
+ 16,
369
+ 17,
370
+ 21,
371
+ 26,
372
+ 29,
373
+ 46,
374
+ 48,
375
+ 52,
376
+ 60,
377
+ 61,
378
+ 66,
379
+ 68,
380
+ 94,
381
+ 91,
382
+ 92,
383
+ 128,
384
+ 113,
385
+ ]
386
+
387
+ ADE_THING_CLASS_ID = [
388
+ 1,
389
+ 7,
390
+ 8,
391
+ 10,
392
+ 11,
393
+ 12,
394
+ 14,
395
+ 15,
396
+ 18,
397
+ 19,
398
+ 20,
399
+ 22,
400
+ 23,
401
+ 24,
402
+ 25,
403
+ 27,
404
+ 28,
405
+ 30,
406
+ 31,
407
+ 32,
408
+ 33,
409
+ 34,
410
+ 35,
411
+ 36,
412
+ 37,
413
+ 38,
414
+ 39,
415
+ 40,
416
+ 41,
417
+ 42,
418
+ 43,
419
+ 44,
420
+ 45,
421
+ 47,
422
+ 49,
423
+ 50,
424
+ 51,
425
+ 53,
426
+ 54,
427
+ 55,
428
+ 56,
429
+ 57,
430
+ 58,
431
+ 59,
432
+ 62,
433
+ 63,
434
+ 64,
435
+ 65,
436
+ 67,
437
+ 69,
438
+ 70,
439
+ 71,
440
+ 72,
441
+ 73,
442
+ 74,
443
+ 75,
444
+ 76,
445
+ 77,
446
+ 78,
447
+ 79,
448
+ 80,
449
+ 81,
450
+ 82,
451
+ 83,
452
+ 84,
453
+ 85,
454
+ 86,
455
+ 87,
456
+ 88,
457
+ 89,
458
+ 90,
459
+ 93,
460
+ 95,
461
+ 96,
462
+ 97,
463
+ 98,
464
+ 99,
465
+ 100,
466
+ 101,
467
+ 102,
468
+ 103,
469
+ 104,
470
+ 105,
471
+ 106,
472
+ 107,
473
+ 108,
474
+ 109,
475
+ 110,
476
+ 111,
477
+ 112,
478
+ 114,
479
+ 115,
480
+ 116,
481
+ 117,
482
+ 118,
483
+ 119,
484
+ 120,
485
+ 121,
486
+ 122,
487
+ 123,
488
+ 124,
489
+ 125,
490
+ 126,
491
+ 127,
492
+ 129,
493
+ 130,
494
+ 131,
495
+ 132,
496
+ 133,
497
+ 134,
498
+ 135,
499
+ 136,
500
+ 137,
501
+ 138,
502
+ 139,
503
+ 140,
504
+ 141,
505
+ 142,
506
+ 143,
507
+ 144,
508
+ 145,
509
+ 146,
510
+ 147,
511
+ 148,
512
+ 149,
513
+ ]
514
+
515
+
516
+ class ADEDataset(torch.utils.data.Dataset):
517
+ """ADE dataset."""
518
+
519
+ def __init__(self, root, split='validation', transform=None):
520
+ """Construct ADE dataset.
521
+
522
+ Args:
523
+ root (string): Root directory where images are downloaded.
524
+ split (string): The split of the dataset.
525
+ transform (callable, optional): Optional transform to be applied on a
526
+ sample.
527
+ """
528
+ self.root = root
529
+ self.image_dir = os.path.join(root, 'images', split)
530
+ self.ann_dir = os.path.join(root, 'annotations', split)
531
+ self.images = os.listdir(self.image_dir)
532
+ self.transform = transform
533
+
534
+ def __getitem__(self, index):
535
+ img_path = os.path.join(self.image_dir, self.images[index])
536
+ img = Image.open(img_path).convert('RGB')
537
+ img = np.asarray(img)
538
+ idx = self.images[index].split('.')[0]
539
+ ann_path = os.path.join(self.ann_dir, f'{idx}.png')
540
+ ann = np.asarray(Image.open(ann_path), dtype=np.int32)
541
+ return img, img_path, ann, idx
542
+
543
+ def __len__(self):
544
+ return len(self.images)
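A small usage sketch for the `ADEDataset` defined above: each item is the raw RGB array, its path, the integer annotation map, and the image id. The data root below is the same placeholder used in the configs.

```python
# Sketch: iterate over the ADE20K validation split with ADEDataset.
from data.ade import ADEDataset, ADE_CLASSES

dataset = ADEDataset(root="$YOUR_ADE_DATA_DIR", split="validation")
print(len(dataset), "images,", len(ADE_CLASSES), "classes")

img, img_path, ann, idx = dataset[0]
print(img.shape, ann.shape, img_path, idx)  # (H, W, 3), (H, W), path, image id
```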
data/ade847.py ADDED
@@ -0,0 +1,1827 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ADE-847 dataset."""
17
+
18
+ import os
19
+ import numpy as np
20
+ from PIL import Image
21
+ # pylint: disable=g-importing-member
22
+ from torch.utils.data import Dataset
23
+
24
+
25
+ ADE_847_CLASSES = [
26
+ 'wall',
27
+ 'building, edifice',
28
+ 'sky',
29
+ 'tree',
30
+ 'road, route',
31
+ 'floor, flooring',
32
+ 'ceiling',
33
+ 'bed',
34
+ 'sidewalk, pavement',
35
+ 'earth, ground',
36
+ 'cabinet',
37
+ 'person, individual, someone, somebody, mortal, soul',
38
+ 'grass',
39
+ 'windowpane, window',
40
+ 'car, auto, automobile, machine, motorcar',
41
+ 'mountain, mount',
42
+ 'plant, flora, plant life',
43
+ 'table',
44
+ 'chair',
45
+ 'curtain, drape, drapery, mantle, pall',
46
+ 'door',
47
+ 'sofa, couch, lounge',
48
+ 'sea',
49
+ 'painting, picture',
50
+ 'water',
51
+ 'mirror',
52
+ 'house',
53
+ 'rug, carpet, carpeting',
54
+ 'shelf',
55
+ 'armchair',
56
+ 'fence, fencing',
57
+ 'field',
58
+ 'lamp',
59
+ 'rock, stone',
60
+ 'seat',
61
+ 'river',
62
+ 'desk',
63
+ 'bathtub, bathing tub, bath, tub',
64
+ 'railing, rail',
65
+ 'signboard, sign',
66
+ 'cushion',
67
+ 'path',
68
+ 'work surface',
69
+ 'stairs, steps',
70
+ 'column, pillar',
71
+ 'sink',
72
+ 'wardrobe, closet, press',
73
+ 'snow',
74
+ 'refrigerator, icebox',
75
+ 'base, pedestal, stand',
76
+ 'bridge, span',
77
+ 'blind, screen',
78
+ 'runway',
79
+ 'cliff, drop, drop-off',
80
+ 'sand',
81
+ 'fireplace, hearth, open fireplace',
82
+ 'pillow',
83
+ 'screen door, screen',
84
+ 'toilet, can, commode, crapper, pot, potty, stool, throne',
85
+ 'skyscraper',
86
+ 'grandstand, covered stand',
87
+ 'box',
88
+ 'pool table, billiard table, snooker table',
89
+ 'palm, palm tree',
90
+ 'double door',
91
+ 'coffee table, cocktail table',
92
+ 'counter',
93
+ 'countertop',
94
+ 'chest of drawers, chest, bureau, dresser',
95
+ 'kitchen island',
96
+ 'boat',
97
+ 'waterfall, falls',
98
+ 'stove, kitchen stove, range, kitchen range, cooking stove',
99
+ 'flower',
100
+ 'bookcase',
101
+ 'controls',
102
+ 'book',
103
+ 'stairway, staircase',
104
+ 'streetlight, street lamp',
105
+ (
106
+ 'computer, computing machine, computing device, data processor,'
107
+ ' electronic computer, information processing system'
108
+ ),
109
+ (
110
+ 'bus, autobus, coach, charabanc, double-decker, jitney, motorbus,'
111
+ ' motorcoach, omnibus, passenger vehicle'
112
+ ),
113
+ 'swivel chair',
114
+ 'light, light source',
115
+ 'bench',
116
+ 'case, display case, showcase, vitrine',
117
+ 'towel',
118
+ 'fountain',
119
+ 'embankment',
120
+ (
121
+ 'television receiver, television, television set, tv, tv set, idiot'
122
+ ' box, boob tube, telly, goggle box'
123
+ ),
124
+ 'van',
125
+ 'hill',
126
+ 'awning, sunshade, sunblind',
127
+ 'poster, posting, placard, notice, bill, card',
128
+ 'truck, motortruck',
129
+ 'airplane, aeroplane, plane',
130
+ 'pole',
131
+ 'tower',
132
+ 'court',
133
+ 'ball',
134
+ 'aircraft carrier, carrier, flattop, attack aircraft carrier',
135
+ 'buffet, counter, sideboard',
136
+ 'hovel, hut, hutch, shack, shanty',
137
+ 'apparel, wearing apparel, dress, clothes',
138
+ 'minibike, motorbike',
139
+ 'animal, animate being, beast, brute, creature, fauna',
140
+ 'chandelier, pendant, pendent',
141
+ 'step, stair',
142
+ 'booth, cubicle, stall, kiosk',
143
+ 'bicycle, bike, wheel, cycle',
144
+ 'doorframe, doorcase',
145
+ 'sconce',
146
+ 'pond',
147
+ 'trade name, brand name, brand, marque',
148
+ 'bannister, banister, balustrade, balusters, handrail',
149
+ 'bag',
150
+ 'traffic light, traffic signal, stoplight',
151
+ 'gazebo',
152
+ 'escalator, moving staircase, moving stairway',
153
+ 'land, ground, soil',
154
+ 'board, plank',
155
+ 'arcade machine',
156
+ 'eiderdown, duvet, continental quilt',
157
+ 'bar',
158
+ 'stall, stand, sales booth',
159
+ 'playground',
160
+ 'ship',
161
+ 'ottoman, pouf, pouffe, puff, hassock',
162
+ (
163
+ 'ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin,'
164
+ ' dustbin, trash barrel, trash bin'
165
+ ),
166
+ 'bottle',
167
+ 'cradle',
168
+ 'pot, flowerpot',
169
+ 'conveyer belt, conveyor belt, conveyer, conveyor, transporter',
170
+ 'train, railroad train',
171
+ 'stool',
172
+ 'lake',
173
+ 'tank, storage tank',
174
+ 'ice, water ice',
175
+ 'basket, handbasket',
176
+ 'manhole',
177
+ 'tent, collapsible shelter',
178
+ 'canopy',
179
+ 'microwave, microwave oven',
180
+ 'barrel, cask',
181
+ 'dirt track',
182
+ 'beam',
183
+ 'dishwasher, dish washer, dishwashing machine',
184
+ 'plate',
185
+ 'screen, crt screen',
186
+ 'ruins',
187
+ 'washer, automatic washer, washing machine',
188
+ 'blanket, cover',
189
+ 'plaything, toy',
190
+ 'food, solid food',
191
+ 'screen, silver screen, projection screen',
192
+ 'oven',
193
+ 'stage',
194
+ 'beacon, lighthouse, beacon light, pharos',
195
+ 'umbrella',
196
+ 'sculpture',
197
+ 'aqueduct',
198
+ 'container',
199
+ 'scaffolding, staging',
200
+ 'hood, exhaust hood',
201
+ 'curb, curbing, kerb',
202
+ 'roller coaster',
203
+ 'horse, equus caballus',
204
+ 'catwalk',
205
+ 'glass, drinking glass',
206
+ 'vase',
207
+ 'central reservation',
208
+ 'carousel',
209
+ 'radiator',
210
+ 'closet',
211
+ 'machine',
212
+ 'pier, wharf, wharfage, dock',
213
+ 'fan',
214
+ 'inflatable bounce game',
215
+ 'pitch',
216
+ 'paper',
217
+ 'arcade, colonnade',
218
+ 'hot tub',
219
+ 'helicopter',
220
+ 'tray',
221
+ 'partition, divider',
222
+ 'vineyard',
223
+ 'bowl',
224
+ 'bullring',
225
+ 'flag',
226
+ 'pot',
227
+ 'footbridge, overcrossing, pedestrian bridge',
228
+ 'shower',
229
+ 'bag, traveling bag, travelling bag, grip, suitcase',
230
+ 'bulletin board, notice board',
231
+ 'confessional booth',
232
+ 'trunk, tree trunk, bole',
233
+ 'forest',
234
+ 'elevator door',
235
+ 'laptop, laptop computer',
236
+ 'instrument panel',
237
+ 'bucket, pail',
238
+ 'tapestry, tapis',
239
+ 'platform',
240
+ 'jacket',
241
+ 'gate',
242
+ 'monitor, monitoring device',
243
+ 'telephone booth, phone booth, call box, telephone box, telephone kiosk',
244
+ 'spotlight, spot',
245
+ 'ring',
246
+ 'control panel',
247
+ 'blackboard, chalkboard',
248
+ 'air conditioner, air conditioning',
249
+ 'chest',
250
+ 'clock',
251
+ 'sand dune',
252
+ 'pipe, pipage, piping',
253
+ 'vault',
254
+ 'table football',
255
+ 'cannon',
256
+ 'swimming pool, swimming bath, natatorium',
257
+ 'fluorescent, fluorescent fixture',
258
+ 'statue',
259
+ 'loudspeaker, speaker, speaker unit, loudspeaker system, speaker system',
260
+ 'exhibitor',
261
+ 'ladder',
262
+ 'carport',
263
+ 'dam',
264
+ 'pulpit',
265
+ 'skylight, fanlight',
266
+ 'water tower',
267
+ 'grill, grille, grillwork',
268
+ 'display board',
269
+ 'pane, pane of glass, window glass',
270
+ 'rubbish, trash, scrap',
271
+ 'ice rink',
272
+ 'fruit',
273
+ 'patio',
274
+ 'vending machine',
275
+ 'telephone, phone, telephone set',
276
+ 'net',
277
+ 'backpack, back pack, knapsack, packsack, rucksack, haversack',
278
+ 'jar',
279
+ 'track',
280
+ 'magazine',
281
+ 'shutter',
282
+ 'roof',
283
+ 'banner, streamer',
284
+ 'landfill',
285
+ 'post',
286
+ 'altarpiece, reredos',
287
+ 'hat, chapeau, lid',
288
+ 'arch, archway',
289
+ 'table game',
290
+ 'bag, handbag, pocketbook, purse',
291
+ 'document, written document, papers',
292
+ 'dome',
293
+ 'pier',
294
+ 'shanties',
295
+ 'forecourt',
296
+ 'crane',
297
+ 'dog, domestic dog, canis familiaris',
298
+ 'piano, pianoforte, forte-piano',
299
+ 'drawing',
300
+ 'cabin',
301
+ 'ad, advertisement, advertizement, advertising, advertizing, advert',
302
+ 'amphitheater, amphitheatre, coliseum',
303
+ 'monument',
304
+ 'henhouse',
305
+ 'cockpit',
306
+ 'heater, warmer',
307
+ 'windmill, aerogenerator, wind generator',
308
+ 'pool',
309
+ 'elevator, lift',
310
+ 'decoration, ornament, ornamentation',
311
+ 'labyrinth',
312
+ 'text, textual matter',
313
+ 'printer',
314
+ 'mezzanine, first balcony',
315
+ 'mattress',
316
+ 'straw',
317
+ 'stalls',
318
+ 'patio, terrace',
319
+ 'billboard, hoarding',
320
+ 'bus stop',
321
+ 'trouser, pant',
322
+ 'console table, console',
323
+ 'rack',
324
+ 'notebook',
325
+ 'shrine',
326
+ 'pantry',
327
+ 'cart',
328
+ 'steam shovel',
329
+ 'porch',
330
+ 'postbox, mailbox, letter box',
331
+ 'figurine, statuette',
332
+ 'recycling bin',
333
+ 'folding screen',
334
+ 'telescope',
335
+ 'deck chair, beach chair',
336
+ 'kennel',
337
+ 'coffee maker',
338
+ "altar, communion table, lord's table",
339
+ 'fish',
340
+ 'easel',
341
+ 'artificial golf green',
342
+ 'iceberg',
343
+ 'candlestick, candle holder',
344
+ 'shower stall, shower bath',
345
+ 'television stand',
346
+ (
347
+ 'wall socket, wall plug, electric outlet, electrical outlet, outlet,'
348
+ ' electric receptacle'
349
+ ),
350
+ 'skeleton',
351
+ 'grand piano, grand',
352
+ 'candy, confect',
353
+ 'grille door',
354
+ 'pedestal, plinth, footstall',
355
+ 'jersey, t-shirt, tee shirt',
356
+ 'shoe',
357
+ 'gravestone, headstone, tombstone',
358
+ 'shanty',
359
+ 'structure',
360
+ 'rocking chair, rocker',
361
+ 'bird',
362
+ 'place mat',
363
+ 'tomb',
364
+ 'big top',
365
+ 'gas pump, gasoline pump, petrol pump, island dispenser',
366
+ 'lockers',
367
+ 'cage',
368
+ 'finger',
369
+ 'bleachers',
370
+ 'ferris wheel',
371
+ 'hairdresser chair',
372
+ 'mat',
373
+ 'stands',
374
+ 'aquarium, fish tank, marine museum',
375
+ 'streetcar, tram, tramcar, trolley, trolley car',
376
+ 'napkin, table napkin, serviette',
377
+ 'dummy',
378
+ 'booklet, brochure, folder, leaflet, pamphlet',
379
+ 'sand trap',
380
+ 'shop, store',
381
+ 'table cloth',
382
+ 'service station',
383
+ 'coffin',
384
+ 'drawer',
385
+ 'cages',
386
+ 'slot machine, coin machine',
387
+ 'balcony',
388
+ 'volleyball court',
389
+ 'table tennis',
390
+ 'control table',
391
+ 'shirt',
392
+ 'merchandise, ware, product',
393
+ 'railway',
394
+ 'parterre',
395
+ 'chimney',
396
+ 'can, tin, tin can',
397
+ 'tanks',
398
+ 'fabric, cloth, material, textile',
399
+ 'alga, algae',
400
+ 'system',
401
+ 'map',
402
+ 'greenhouse',
403
+ 'mug',
404
+ 'barbecue',
405
+ 'trailer',
406
+ 'toilet tissue, toilet paper, bathroom tissue',
407
+ 'organ',
408
+ 'dishrag, dishcloth',
409
+ 'island',
410
+ 'keyboard',
411
+ 'trench',
412
+ 'basket, basketball hoop, hoop',
413
+ 'steering wheel, wheel',
414
+ 'pitcher, ewer',
415
+ 'goal',
416
+ 'bread, breadstuff, staff of life',
417
+ 'beds',
418
+ 'wood',
419
+ 'file cabinet',
420
+ 'newspaper, paper',
421
+ 'motorboat',
422
+ 'rope',
423
+ 'guitar',
424
+ 'rubble',
425
+ 'scarf',
426
+ 'barrels',
427
+ 'cap',
428
+ 'leaves',
429
+ 'control tower',
430
+ 'dashboard',
431
+ 'bandstand',
432
+ 'lectern',
433
+ 'switch, electric switch, electrical switch',
434
+ 'baseboard, mopboard, skirting board',
435
+ 'shower room',
436
+ 'smoke',
437
+ 'faucet, spigot',
438
+ 'bulldozer',
439
+ 'saucepan',
440
+ 'shops',
441
+ 'meter',
442
+ 'crevasse',
443
+ 'gear',
444
+ 'candelabrum, candelabra',
445
+ 'sofa bed',
446
+ 'tunnel',
447
+ 'pallet',
448
+ 'wire, conducting wire',
449
+ 'kettle, boiler',
450
+ 'bidet',
451
+ (
452
+ 'baby buggy, baby carriage, carriage, perambulator, pram, stroller,'
453
+ ' go-cart, pushchair, pusher'
454
+ ),
455
+ 'music stand',
456
+ 'pipe, tube',
457
+ 'cup',
458
+ 'parking meter',
459
+ 'ice hockey rink',
460
+ 'shelter',
461
+ 'weeds',
462
+ 'temple',
463
+ 'patty, cake',
464
+ 'ski slope',
465
+ 'panel',
466
+ 'wallet',
467
+ 'wheel',
468
+ 'towel rack, towel horse',
469
+ 'roundabout',
470
+ 'canister, cannister, tin',
471
+ 'rod',
472
+ 'soap dispenser',
473
+ 'bell',
474
+ 'canvas',
475
+ 'box office, ticket office, ticket booth',
476
+ 'teacup',
477
+ 'trellis',
478
+ 'workbench',
479
+ 'valley, vale',
480
+ 'toaster',
481
+ 'knife',
482
+ 'podium',
483
+ 'ramp',
484
+ 'tumble dryer',
485
+ 'fireplug, fire hydrant, plug',
486
+ 'gym shoe, sneaker, tennis shoe',
487
+ 'lab bench',
488
+ 'equipment',
489
+ 'rocky formation',
490
+ 'plastic',
491
+ 'calendar',
492
+ 'caravan',
493
+ 'check-in-desk',
494
+ 'ticket counter',
495
+ 'brush',
496
+ 'mill',
497
+ 'covered bridge',
498
+ 'bowling alley',
499
+ 'hanger',
500
+ 'excavator',
501
+ 'trestle',
502
+ 'revolving door',
503
+ 'blast furnace',
504
+ 'scale, weighing machine',
505
+ 'projector',
506
+ 'soap',
507
+ 'locker',
508
+ 'tractor',
509
+ 'stretcher',
510
+ 'frame',
511
+ 'grating',
512
+ 'alembic',
513
+ 'candle, taper, wax light',
514
+ 'barrier',
515
+ 'cardboard',
516
+ 'cave',
517
+ 'puddle',
518
+ 'tarp',
519
+ 'price tag',
520
+ 'watchtower',
521
+ 'meters',
522
+ (
523
+ 'light bulb, lightbulb, bulb, incandescent lamp, electric light,'
524
+ ' electric-light bulb'
525
+ ),
526
+ 'tracks',
527
+ 'hair dryer',
528
+ 'skirt',
529
+ 'viaduct',
530
+ 'paper towel',
531
+ 'coat',
532
+ 'sheet',
533
+ 'fire extinguisher, extinguisher, asphyxiator',
534
+ 'water wheel',
535
+ 'pottery, clayware',
536
+ 'magazine rack',
537
+ 'teapot',
538
+ 'microphone, mike',
539
+ 'support',
540
+ 'forklift',
541
+ 'canyon',
542
+ 'cash register, register',
543
+ 'leaf, leafage, foliage',
544
+ 'remote control, remote',
545
+ 'soap dish',
546
+ 'windshield, windscreen',
547
+ 'cat',
548
+ 'cue, cue stick, pool cue, pool stick',
549
+ 'vent, venthole, vent-hole, blowhole',
550
+ 'videos',
551
+ 'shovel',
552
+ 'eaves',
553
+ 'antenna, aerial, transmitting aerial',
554
+ 'shipyard',
555
+ 'hen, biddy',
556
+ 'traffic cone',
557
+ 'washing machines',
558
+ 'truck crane',
559
+ 'cds',
560
+ 'niche',
561
+ 'scoreboard',
562
+ 'briefcase',
563
+ 'boot',
564
+ 'sweater, jumper',
565
+ 'hay',
566
+ 'pack',
567
+ 'bottle rack',
568
+ 'glacier',
569
+ 'pergola',
570
+ 'building materials',
571
+ 'television camera',
572
+ 'first floor',
573
+ 'rifle',
574
+ 'tennis table',
575
+ 'stadium',
576
+ 'safety belt',
577
+ 'cover',
578
+ 'dish rack',
579
+ 'synthesizer',
580
+ 'pumpkin',
581
+ 'gutter',
582
+ 'fruit stand',
583
+ 'ice floe, floe',
584
+ 'handle, grip, handgrip, hold',
585
+ 'wheelchair',
586
+ 'mousepad, mouse mat',
587
+ 'diploma',
588
+ 'fairground ride',
589
+ 'radio',
590
+ 'hotplate',
591
+ 'junk',
592
+ 'wheelbarrow',
593
+ 'stream',
594
+ 'toll plaza',
595
+ 'punching bag',
596
+ 'trough',
597
+ 'throne',
598
+ 'chair desk',
599
+ 'weighbridge',
600
+ 'extractor fan',
601
+ 'hanging clothes',
602
+ 'dish, dish aerial, dish antenna, saucer',
603
+ 'alarm clock, alarm',
604
+ 'ski lift',
605
+ 'chain',
606
+ 'garage',
607
+ 'mechanical shovel',
608
+ 'wine rack',
609
+ 'tramway',
610
+ 'treadmill',
611
+ 'menu',
612
+ 'block',
613
+ 'well',
614
+ 'witness stand',
615
+ 'branch',
616
+ 'duck',
617
+ 'casserole',
618
+ 'frying pan',
619
+ 'desk organizer',
620
+ 'mast',
621
+ 'spectacles, specs, eyeglasses, glasses',
622
+ 'service elevator',
623
+ 'dollhouse',
624
+ 'hammock',
625
+ 'clothes hanging',
626
+ 'photocopier',
627
+ 'notepad',
628
+ 'golf cart',
629
+ 'footpath',
630
+ 'cross',
631
+ 'baptismal font',
632
+ 'boiler',
633
+ 'skip',
634
+ 'rotisserie',
635
+ 'tables',
636
+ 'water mill',
637
+ 'helmet',
638
+ 'cover curtain',
639
+ 'brick',
640
+ 'table runner',
641
+ 'ashtray',
642
+ 'street box',
643
+ 'stick',
644
+ 'hangers',
645
+ 'cells',
646
+ 'urinal',
647
+ 'centerpiece',
648
+ 'portable fridge',
649
+ 'dvds',
650
+ 'golf club',
651
+ 'skirting board',
652
+ 'water cooler',
653
+ 'clipboard',
654
+ 'camera, photographic camera',
655
+ 'pigeonhole',
656
+ 'chips',
657
+ 'food processor',
658
+ 'post box',
659
+ 'lid',
660
+ 'drum',
661
+ 'blender',
662
+ 'cave entrance',
663
+ 'dental chair',
664
+ 'obelisk',
665
+ 'canoe',
666
+ 'mobile',
667
+ 'monitors',
668
+ 'pool ball',
669
+ 'cue rack',
670
+ 'baggage carts',
671
+ 'shore',
672
+ 'fork',
673
+ 'paper filer',
674
+ 'bicycle rack',
675
+ 'coat rack',
676
+ 'garland',
677
+ 'sports bag',
678
+ 'fish tank',
679
+ 'towel dispenser',
680
+ 'carriage',
681
+ 'brochure',
682
+ 'plaque',
683
+ 'stringer',
684
+ 'iron',
685
+ 'spoon',
686
+ 'flag pole',
687
+ 'toilet brush',
688
+ 'book stand',
689
+ 'water faucet, water tap, tap, hydrant',
690
+ 'ticket office',
691
+ 'broom',
692
+ 'dvd',
693
+ 'ice bucket',
694
+ 'carapace, shell, cuticle, shield',
695
+ 'tureen',
696
+ 'folders',
697
+ 'chess',
698
+ 'root',
699
+ 'sewing machine',
700
+ 'model',
701
+ 'pen',
702
+ 'violin',
703
+ 'sweatshirt',
704
+ 'recycling materials',
705
+ 'mitten',
706
+ 'chopping board, cutting board',
707
+ 'mask',
708
+ 'log',
709
+ 'mouse, computer mouse',
710
+ 'grill',
711
+ 'hole',
712
+ 'target',
713
+ 'trash bag',
714
+ 'chalk',
715
+ 'sticks',
716
+ 'balloon',
717
+ 'score',
718
+ 'hair spray',
719
+ 'roll',
720
+ 'runner',
721
+ 'engine',
722
+ 'inflatable glove',
723
+ 'games',
724
+ 'pallets',
725
+ 'baskets',
726
+ 'coop',
727
+ 'dvd player',
728
+ 'rocking horse',
729
+ 'buckets',
730
+ 'bread rolls',
731
+ 'shawl',
732
+ 'watering can',
733
+ 'spotlights',
734
+ 'post-it',
735
+ 'bowls',
736
+ 'security camera',
737
+ 'runner cloth',
738
+ 'lock',
739
+ 'alarm, warning device, alarm system',
740
+ 'side',
741
+ 'roulette',
742
+ 'bone',
743
+ 'cutlery',
744
+ 'pool balls',
745
+ 'wheels',
746
+ 'spice rack',
747
+ 'plant pots',
748
+ 'towel ring',
749
+ 'bread box',
750
+ 'video',
751
+ 'funfair',
752
+ 'breads',
753
+ 'tripod',
754
+ 'ironing board',
755
+ 'skimmer',
756
+ 'hollow',
757
+ 'scratching post',
758
+ 'tricycle',
759
+ 'file box',
760
+ 'mountain pass',
761
+ 'tombstones',
762
+ 'cooker',
763
+ 'card game, cards',
764
+ 'golf bag',
765
+ 'towel paper',
766
+ 'chaise lounge',
767
+ 'sun',
768
+ 'toilet paper holder',
769
+ 'rake',
770
+ 'key',
771
+ 'umbrella stand',
772
+ 'dartboard',
773
+ 'transformer',
774
+ 'fireplace utensils',
775
+ 'sweatshirts',
776
+ 'cellular telephone, cellular phone, cellphone, cell, mobile phone',
777
+ 'tallboy',
778
+ 'stapler',
779
+ 'sauna',
780
+ 'test tube',
781
+ 'palette',
782
+ 'shopping carts',
783
+ 'tools',
784
+ 'push button, push, button',
785
+ 'star',
786
+ 'roof rack',
787
+ 'barbed wire',
788
+ 'spray',
789
+ 'ear',
790
+ 'sponge',
791
+ 'racket',
792
+ 'tins',
793
+ 'eyeglasses',
794
+ 'file',
795
+ 'scarfs',
796
+ 'sugar bowl',
797
+ 'flip flop',
798
+ 'headstones',
799
+ 'laptop bag',
800
+ 'leash',
801
+ 'climbing frame',
802
+ 'suit hanger',
803
+ 'floor spotlight',
804
+ 'plate rack',
805
+ 'sewer',
806
+ 'hard drive',
807
+ 'sprinkler',
808
+ 'tools box',
809
+ 'necklace',
810
+ 'bulbs',
811
+ 'steel industry',
812
+ 'club',
813
+ 'jack',
814
+ 'door bars',
815
+ 'control panel, instrument panel, control board, board, panel',
816
+ 'hairbrush',
817
+ 'napkin holder',
818
+ 'office',
819
+ 'smoke detector',
820
+ 'utensils',
821
+ 'apron',
822
+ 'scissors',
823
+ 'terminal',
824
+ 'grinder',
825
+ 'entry phone',
826
+ 'newspaper stand',
827
+ 'pepper shaker',
828
+ 'onions',
829
+ (
830
+ 'central processing unit, cpu, c p u , central processor, processor,'
831
+ ' mainframe'
832
+ ),
833
+ 'tape',
834
+ 'bat',
835
+ 'coaster',
836
+ 'calculator',
837
+ 'potatoes',
838
+ 'luggage rack',
839
+ 'salt',
840
+ 'street number',
841
+ 'viewpoint',
842
+ 'sword',
843
+ 'cd',
844
+ 'rowing machine',
845
+ 'plug',
846
+ 'andiron, firedog, dog, dog-iron',
847
+ 'pepper',
848
+ 'tongs',
849
+ 'bonfire',
850
+ 'dog dish',
851
+ 'belt',
852
+ 'dumbbells',
853
+ 'videocassette recorder, vcr',
854
+ 'hook',
855
+ 'envelopes',
856
+ 'shower faucet',
857
+ 'watch',
858
+ 'padlock',
859
+ 'swimming pool ladder',
860
+ 'spanners',
861
+ 'gravy boat',
862
+ 'notice board',
863
+ 'trash bags',
864
+ 'fire alarm',
865
+ 'ladle',
866
+ 'stethoscope',
867
+ 'rocket',
868
+ 'funnel',
869
+ 'bowling pins',
870
+ 'valve',
871
+ 'thermometer',
872
+ 'cups',
873
+ 'spice jar',
874
+ 'night light',
875
+ 'soaps',
876
+ 'games table',
877
+ 'slotted spoon',
878
+ 'reel',
879
+ 'scourer',
880
+ 'sleeping robe',
881
+ 'desk mat',
882
+ 'dumbbell',
883
+ 'hammer',
884
+ 'tie',
885
+ 'typewriter',
886
+ 'shaker',
887
+ 'cheese dish',
888
+ 'sea star',
889
+ 'racquet',
890
+ 'butane gas cylinder',
891
+ 'paper weight',
892
+ 'shaving brush',
893
+ 'sunglasses',
894
+ 'gear shift',
895
+ 'towel rail',
896
+ 'adding machine, totalizer, totaliser',
897
+ ]
898
+
899
+ ADE_847_CLASS_ID = list(range(847))
900
+
901
+ ADE_847_STUFF_CLASS = [
902
+ 'wall',
903
+ 'sky',
904
+ 'tree',
905
+ 'road, route',
906
+ 'floor, flooring',
907
+ 'sidewalk, pavement',
908
+ 'earth, ground',
909
+ 'grass',
910
+ 'mountain, mount',
911
+ 'plant, flora, plant life',
912
+ 'sea',
913
+ 'water',
914
+ 'rock, stone',
915
+ 'snow',
916
+ 'sand',
917
+ 'island',
918
+ 'field',
919
+ 'forest',
920
+ 'land, ground, soil',
921
+ 'lake',
922
+ 'ice, water ice',
923
+ 'cliff, drop, drop-off',
924
+ 'dirt track',
925
+ 'hill',
926
+ 'valley, vale',
927
+ 'stream',
928
+ 'shore',
929
+ 'pond',
930
+ 'iceberg',
931
+ ]
932
+
933
+ ADE_847_THING_CLASS = [
934
+ 'building, edifice',
935
+ 'ceiling',
936
+ 'bed',
937
+ 'cabinet',
938
+ 'person, individual, someone, somebody, mortal, soul',
939
+ 'windowpane, window',
940
+ 'car, auto, automobile, machine, motorcar',
941
+ 'table',
942
+ 'chair',
943
+ 'curtain, drape, drapery, mantle, pall',
944
+ 'door',
945
+ 'sofa, couch, lounge',
946
+ 'painting, picture',
947
+ 'mirror',
948
+ 'house',
949
+ 'rug, carpet, carpeting',
950
+ 'shelf',
951
+ 'armchair',
952
+ 'fence, fencing',
953
+ 'lamp',
954
+ 'seat',
955
+ 'river',
956
+ 'desk',
957
+ 'bathtub, bathing tub, bath, tub',
958
+ 'railing, rail',
959
+ 'signboard, sign',
960
+ 'cushion',
961
+ 'path',
962
+ 'work surface',
963
+ 'stairs, steps',
964
+ 'column, pillar',
965
+ 'sink',
966
+ 'wardrobe, closet, press',
967
+ 'refrigerator, icebox',
968
+ 'base, pedestal, stand',
969
+ 'bridge, span',
970
+ 'blind, screen',
971
+ 'runway',
972
+ 'fireplace, hearth, open fireplace',
973
+ 'pillow',
974
+ 'screen door, screen',
975
+ 'toilet, can, commode, crapper, pot, potty, stool, throne',
976
+ 'skyscraper',
977
+ 'grandstand, covered stand',
978
+ 'box',
979
+ 'pool table, billiard table, snooker table',
980
+ 'palm, palm tree',
981
+ 'double door',
982
+ 'coffee table, cocktail table',
983
+ 'counter',
984
+ 'countertop',
985
+ 'chest of drawers, chest, bureau, dresser',
986
+ 'kitchen island',
987
+ 'boat',
988
+ 'waterfall, falls',
989
+ 'stove, kitchen stove, range, kitchen range, cooking stove',
990
+ 'flower',
991
+ 'bookcase',
992
+ 'controls',
993
+ 'book',
994
+ 'stairway, staircase',
995
+ 'streetlight, street lamp',
996
+ (
997
+ 'computer, computing machine, computing device, data processor,'
998
+ ' electronic computer, information processing system'
999
+ ),
1000
+ (
1001
+ 'bus, autobus, coach, charabanc, double-decker, jitney, motorbus,'
1002
+ ' motorcoach, omnibus, passenger vehicle'
1003
+ ),
1004
+ 'swivel chair',
1005
+ 'light, light source',
1006
+ 'bench',
1007
+ 'case, display case, showcase, vitrine',
1008
+ 'towel',
1009
+ 'fountain',
1010
+ 'embankment',
1011
+ (
1012
+ 'television receiver, television, television set, tv, tv set, idiot'
1013
+ ' box, boob tube, telly, goggle box'
1014
+ ),
1015
+ 'van',
1016
+ 'awning, sunshade, sunblind',
1017
+ 'poster, posting, placard, notice, bill, card',
1018
+ 'truck, motortruck',
1019
+ 'airplane, aeroplane, plane',
1020
+ 'pole',
1021
+ 'tower',
1022
+ 'court',
1023
+ 'ball',
1024
+ 'aircraft carrier, carrier, flattop, attack aircraft carrier',
1025
+ 'buffet, counter, sideboard',
1026
+ 'hovel, hut, hutch, shack, shanty',
1027
+ 'apparel, wearing apparel, dress, clothes',
1028
+ 'minibike, motorbike',
1029
+ 'animal, animate being, beast, brute, creature, fauna',
1030
+ 'chandelier, pendant, pendent',
1031
+ 'step, stair',
1032
+ 'booth, cubicle, stall, kiosk',
1033
+ 'bicycle, bike, wheel, cycle',
1034
+ 'doorframe, doorcase',
1035
+ 'sconce',
1036
+ 'trade name, brand name, brand, marque',
1037
+ 'bannister, banister, balustrade, balusters, handrail',
1038
+ 'bag',
1039
+ 'traffic light, traffic signal, stoplight',
1040
+ 'gazebo',
1041
+ 'escalator, moving staircase, moving stairway',
1042
+ 'board, plank',
1043
+ 'arcade machine',
1044
+ 'eiderdown, duvet, continental quilt',
1045
+ 'bar',
1046
+ 'stall, stand, sales booth',
1047
+ 'playground',
1048
+ 'ship',
1049
+ 'ottoman, pouf, pouffe, puff, hassock',
1050
+ (
1051
+ 'ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin,'
1052
+ ' dustbin, trash barrel, trash bin'
1053
+ ),
1054
+ 'bottle',
1055
+ 'cradle',
1056
+ 'pot, flowerpot',
1057
+ 'conveyer belt, conveyor belt, conveyer, conveyor, transporter',
1058
+ 'train, railroad train',
1059
+ 'stool',
1060
+ 'tank, storage tank',
1061
+ 'basket, handbasket',
1062
+ 'manhole',
1063
+ 'tent, collapsible shelter',
1064
+ 'canopy',
1065
+ 'microwave, microwave oven',
1066
+ 'barrel, cask',
1067
+ 'beam',
1068
+ 'dishwasher, dish washer, dishwashing machine',
1069
+ 'plate',
1070
+ 'screen, crt screen',
1071
+ 'ruins',
1072
+ 'washer, automatic washer, washing machine',
1073
+ 'blanket, cover',
1074
+ 'plaything, toy',
1075
+ 'food, solid food',
1076
+ 'screen, silver screen, projection screen',
1077
+ 'oven',
1078
+ 'stage',
1079
+ 'beacon, lighthouse, beacon light, pharos',
1080
+ 'umbrella',
1081
+ 'sculpture',
1082
+ 'aqueduct',
1083
+ 'container',
1084
+ 'scaffolding, staging',
1085
+ 'hood, exhaust hood',
1086
+ 'curb, curbing, kerb',
1087
+ 'roller coaster',
1088
+ 'horse, equus caballus',
1089
+ 'catwalk',
1090
+ 'glass, drinking glass',
1091
+ 'vase',
1092
+ 'central reservation',
1093
+ 'carousel',
1094
+ 'radiator',
1095
+ 'closet',
1096
+ 'machine',
1097
+ 'pier, wharf, wharfage, dock',
1098
+ 'fan',
1099
+ 'inflatable bounce game',
1100
+ 'pitch',
1101
+ 'paper',
1102
+ 'arcade, colonnade',
1103
+ 'hot tub',
1104
+ 'helicopter',
1105
+ 'tray',
1106
+ 'partition, divider',
1107
+ 'vineyard',
1108
+ 'bowl',
1109
+ 'bullring',
1110
+ 'flag',
1111
+ 'pot',
1112
+ 'footbridge, overcrossing, pedestrian bridge',
1113
+ 'shower',
1114
+ 'bag, traveling bag, travelling bag, grip, suitcase',
1115
+ 'bulletin board, notice board',
1116
+ 'confessional booth',
1117
+ 'trunk, tree trunk, bole',
1118
+ 'elevator door',
1119
+ 'laptop, laptop computer',
1120
+ 'instrument panel',
1121
+ 'bucket, pail',
1122
+ 'tapestry, tapis',
1123
+ 'platform',
1124
+ 'jacket',
1125
+ 'gate',
1126
+ 'monitor, monitoring device',
1127
+ 'telephone booth, phone booth, call box, telephone box, telephone kiosk',
1128
+ 'spotlight, spot',
1129
+ 'ring',
1130
+ 'control panel',
1131
+ 'blackboard, chalkboard',
1132
+ 'air conditioner, air conditioning',
1133
+ 'chest',
1134
+ 'clock',
1135
+ 'sand dune',
1136
+ 'pipe, pipage, piping',
1137
+ 'vault',
1138
+ 'table football',
1139
+ 'cannon',
1140
+ 'swimming pool, swimming bath, natatorium',
1141
+ 'fluorescent, fluorescent fixture',
1142
+ 'statue',
1143
+ 'loudspeaker, speaker, speaker unit, loudspeaker system, speaker system',
1144
+ 'exhibitor',
1145
+ 'ladder',
1146
+ 'carport',
1147
+ 'dam',
1148
+ 'pulpit',
1149
+ 'skylight, fanlight',
1150
+ 'water tower',
1151
+ 'grill, grille, grillwork',
1152
+ 'display board',
1153
+ 'pane, pane of glass, window glass',
1154
+ 'rubbish, trash, scrap',
1155
+ 'ice rink',
1156
+ 'fruit',
1157
+ 'patio',
1158
+ 'vending machine',
1159
+ 'telephone, phone, telephone set',
1160
+ 'net',
1161
+ 'backpack, back pack, knapsack, packsack, rucksack, haversack',
1162
+ 'jar',
1163
+ 'track',
1164
+ 'magazine',
1165
+ 'shutter',
1166
+ 'roof',
1167
+ 'banner, streamer',
1168
+ 'landfill',
1169
+ 'post',
1170
+ 'altarpiece, reredos',
1171
+ 'hat, chapeau, lid',
1172
+ 'arch, archway',
1173
+ 'table game',
1174
+ 'bag, handbag, pocketbook, purse',
1175
+ 'document, written document, papers',
1176
+ 'dome',
1177
+ 'pier',
1178
+ 'shanties',
1179
+ 'forecourt',
1180
+ 'crane',
1181
+ 'dog, domestic dog, canis familiaris',
1182
+ 'piano, pianoforte, forte-piano',
1183
+ 'drawing',
1184
+ 'cabin',
1185
+ 'ad, advertisement, advertizement, advertising, advertizing, advert',
1186
+ 'amphitheater, amphitheatre, coliseum',
1187
+ 'monument',
1188
+ 'henhouse',
1189
+ 'cockpit',
1190
+ 'heater, warmer',
1191
+ 'windmill, aerogenerator, wind generator',
1192
+ 'pool',
1193
+ 'elevator, lift',
1194
+ 'decoration, ornament, ornamentation',
1195
+ 'labyrinth',
1196
+ 'text, textual matter',
1197
+ 'printer',
1198
+ 'mezzanine, first balcony',
1199
+ 'mattress',
1200
+ 'straw',
1201
+ 'stalls',
1202
+ 'patio, terrace',
1203
+ 'billboard, hoarding',
1204
+ 'bus stop',
1205
+ 'trouser, pant',
1206
+ 'console table, console',
1207
+ 'rack',
1208
+ 'notebook',
1209
+ 'shrine',
1210
+ 'pantry',
1211
+ 'cart',
1212
+ 'steam shovel',
1213
+ 'porch',
1214
+ 'postbox, mailbox, letter box',
1215
+ 'figurine, statuette',
1216
+ 'recycling bin',
1217
+ 'folding screen',
1218
+ 'telescope',
1219
+ 'deck chair, beach chair',
1220
+ 'kennel',
1221
+ 'coffee maker',
1222
+ "altar, communion table, lord's table",
1223
+ 'fish',
1224
+ 'easel',
1225
+ 'artificial golf green',
1226
+ 'candlestick, candle holder',
1227
+ 'shower stall, shower bath',
1228
+ 'television stand',
1229
+ (
1230
+ 'wall socket, wall plug, electric outlet, electrical outlet, outlet,'
1231
+ ' electric receptacle'
1232
+ ),
1233
+ 'skeleton',
1234
+ 'grand piano, grand',
1235
+ 'candy, confect',
1236
+ 'grille door',
1237
+ 'pedestal, plinth, footstall',
1238
+ 'jersey, t-shirt, tee shirt',
1239
+ 'shoe',
1240
+ 'gravestone, headstone, tombstone',
1241
+ 'shanty',
1242
+ 'structure',
1243
+ 'rocking chair, rocker',
1244
+ 'bird',
1245
+ 'place mat',
1246
+ 'tomb',
1247
+ 'big top',
1248
+ 'gas pump, gasoline pump, petrol pump, island dispenser',
1249
+ 'lockers',
1250
+ 'cage',
1251
+ 'finger',
1252
+ 'bleachers',
1253
+ 'ferris wheel',
1254
+ 'hairdresser chair',
1255
+ 'mat',
1256
+ 'stands',
1257
+ 'aquarium, fish tank, marine museum',
1258
+ 'streetcar, tram, tramcar, trolley, trolley car',
1259
+ 'napkin, table napkin, serviette',
1260
+ 'dummy',
1261
+ 'booklet, brochure, folder, leaflet, pamphlet',
1262
+ 'sand trap',
1263
+ 'shop, store',
1264
+ 'table cloth',
1265
+ 'service station',
1266
+ 'coffin',
1267
+ 'drawer',
1268
+ 'cages',
1269
+ 'slot machine, coin machine',
1270
+ 'balcony',
1271
+ 'volleyball court',
1272
+ 'table tennis',
1273
+ 'control table',
1274
+ 'shirt',
1275
+ 'merchandise, ware, product',
1276
+ 'railway',
1277
+ 'parterre',
1278
+ 'chimney',
1279
+ 'can, tin, tin can',
1280
+ 'tanks',
1281
+ 'fabric, cloth, material, textile',
1282
+ 'alga, algae',
1283
+ 'system',
1284
+ 'map',
1285
+ 'greenhouse',
1286
+ 'mug',
1287
+ 'barbecue',
1288
+ 'trailer',
1289
+ 'toilet tissue, toilet paper, bathroom tissue',
1290
+ 'organ',
1291
+ 'dishrag, dishcloth',
1292
+ 'keyboard',
1293
+ 'trench',
1294
+ 'basket, basketball hoop, hoop',
1295
+ 'steering wheel, wheel',
1296
+ 'pitcher, ewer',
1297
+ 'goal',
1298
+ 'bread, breadstuff, staff of life',
1299
+ 'beds',
1300
+ 'wood',
1301
+ 'file cabinet',
1302
+ 'newspaper, paper',
1303
+ 'motorboat',
1304
+ 'rope',
1305
+ 'guitar',
1306
+ 'rubble',
1307
+ 'scarf',
1308
+ 'barrels',
1309
+ 'cap',
1310
+ 'leaves',
1311
+ 'control tower',
1312
+ 'dashboard',
1313
+ 'bandstand',
1314
+ 'lectern',
1315
+ 'switch, electric switch, electrical switch',
1316
+ 'baseboard, mopboard, skirting board',
1317
+ 'shower room',
1318
+ 'smoke',
1319
+ 'faucet, spigot',
1320
+ 'bulldozer',
1321
+ 'saucepan',
1322
+ 'shops',
1323
+ 'meter',
1324
+ 'crevasse',
1325
+ 'gear',
1326
+ 'candelabrum, candelabra',
1327
+ 'sofa bed',
1328
+ 'tunnel',
1329
+ 'pallet',
1330
+ 'wire, conducting wire',
1331
+ 'kettle, boiler',
1332
+ 'bidet',
1333
+ (
1334
+ 'baby buggy, baby carriage, carriage, perambulator, pram, stroller,'
1335
+ ' go-cart, pushchair, pusher'
1336
+ ),
1337
+ 'music stand',
1338
+ 'pipe, tube',
1339
+ 'cup',
1340
+ 'parking meter',
1341
+ 'ice hockey rink',
1342
+ 'shelter',
1343
+ 'weeds',
1344
+ 'temple',
1345
+ 'patty, cake',
1346
+ 'ski slope',
1347
+ 'panel',
1348
+ 'wallet',
1349
+ 'wheel',
1350
+ 'towel rack, towel horse',
1351
+ 'roundabout',
1352
+ 'canister, cannister, tin',
1353
+ 'rod',
1354
+ 'soap dispenser',
1355
+ 'bell',
1356
+ 'canvas',
1357
+ 'box office, ticket office, ticket booth',
1358
+ 'teacup',
1359
+ 'trellis',
1360
+ 'workbench',
1361
+ 'toaster',
1362
+ 'knife',
1363
+ 'podium',
1364
+ 'ramp',
1365
+ 'tumble dryer',
1366
+ 'fireplug, fire hydrant, plug',
1367
+ 'gym shoe, sneaker, tennis shoe',
1368
+ 'lab bench',
1369
+ 'equipment',
1370
+ 'rocky formation',
1371
+ 'plastic',
1372
+ 'calendar',
1373
+ 'caravan',
1374
+ 'check-in-desk',
1375
+ 'ticket counter',
1376
+ 'brush',
1377
+ 'mill',
1378
+ 'covered bridge',
1379
+ 'bowling alley',
1380
+ 'hanger',
1381
+ 'excavator',
1382
+ 'trestle',
1383
+ 'revolving door',
1384
+ 'blast furnace',
1385
+ 'scale, weighing machine',
1386
+ 'projector',
1387
+ 'soap',
1388
+ 'locker',
1389
+ 'tractor',
1390
+ 'stretcher',
1391
+ 'frame',
1392
+ 'grating',
1393
+ 'alembic',
1394
+ 'candle, taper, wax light',
1395
+ 'barrier',
1396
+ 'cardboard',
1397
+ 'cave',
1398
+ 'puddle',
1399
+ 'tarp',
1400
+ 'price tag',
1401
+ 'watchtower',
1402
+ 'meters',
1403
+ (
1404
+ 'light bulb, lightbulb, bulb, incandescent lamp, electric light,'
1405
+ ' electric-light bulb'
1406
+ ),
1407
+ 'tracks',
1408
+ 'hair dryer',
1409
+ 'skirt',
1410
+ 'viaduct',
1411
+ 'paper towel',
1412
+ 'coat',
1413
+ 'sheet',
1414
+ 'fire extinguisher, extinguisher, asphyxiator',
1415
+ 'water wheel',
1416
+ 'pottery, clayware',
1417
+ 'magazine rack',
1418
+ 'teapot',
1419
+ 'microphone, mike',
1420
+ 'support',
1421
+ 'forklift',
1422
+ 'canyon',
1423
+ 'cash register, register',
1424
+ 'leaf, leafage, foliage',
1425
+ 'remote control, remote',
1426
+ 'soap dish',
1427
+ 'windshield, windscreen',
1428
+ 'cat',
1429
+ 'cue, cue stick, pool cue, pool stick',
1430
+ 'vent, venthole, vent-hole, blowhole',
1431
+ 'videos',
1432
+ 'shovel',
1433
+ 'eaves',
1434
+ 'antenna, aerial, transmitting aerial',
1435
+ 'shipyard',
1436
+ 'hen, biddy',
1437
+ 'traffic cone',
1438
+ 'washing machines',
1439
+ 'truck crane',
1440
+ 'cds',
1441
+ 'niche',
1442
+ 'scoreboard',
1443
+ 'briefcase',
1444
+ 'boot',
1445
+ 'sweater, jumper',
1446
+ 'hay',
1447
+ 'pack',
1448
+ 'bottle rack',
1449
+ 'glacier',
1450
+ 'pergola',
1451
+ 'building materials',
1452
+ 'television camera',
1453
+ 'first floor',
1454
+ 'rifle',
1455
+ 'tennis table',
1456
+ 'stadium',
1457
+ 'safety belt',
1458
+ 'cover',
1459
+ 'dish rack',
1460
+ 'synthesizer',
1461
+ 'pumpkin',
1462
+ 'gutter',
1463
+ 'fruit stand',
1464
+ 'ice floe, floe',
1465
+ 'handle, grip, handgrip, hold',
1466
+ 'wheelchair',
1467
+ 'mousepad, mouse mat',
1468
+ 'diploma',
1469
+ 'fairground ride',
1470
+ 'radio',
1471
+ 'hotplate',
1472
+ 'junk',
1473
+ 'wheelbarrow',
1474
+ 'toll plaza',
1475
+ 'punching bag',
1476
+ 'trough',
1477
+ 'throne',
1478
+ 'chair desk',
1479
+ 'weighbridge',
1480
+ 'extractor fan',
1481
+ 'hanging clothes',
1482
+ 'dish, dish aerial, dish antenna, saucer',
1483
+ 'alarm clock, alarm',
1484
+ 'ski lift',
1485
+ 'chain',
1486
+ 'garage',
1487
+ 'mechanical shovel',
1488
+ 'wine rack',
1489
+ 'tramway',
1490
+ 'treadmill',
1491
+ 'menu',
1492
+ 'block',
1493
+ 'well',
1494
+ 'witness stand',
1495
+ 'branch',
1496
+ 'duck',
1497
+ 'casserole',
1498
+ 'frying pan',
1499
+ 'desk organizer',
1500
+ 'mast',
1501
+ 'spectacles, specs, eyeglasses, glasses',
1502
+ 'service elevator',
1503
+ 'dollhouse',
1504
+ 'hammock',
1505
+ 'clothes hanging',
1506
+ 'photocopier',
1507
+ 'notepad',
1508
+ 'golf cart',
1509
+ 'footpath',
1510
+ 'cross',
1511
+ 'baptismal font',
1512
+ 'boiler',
1513
+ 'skip',
1514
+ 'rotisserie',
1515
+ 'tables',
1516
+ 'water mill',
1517
+ 'helmet',
1518
+ 'cover curtain',
1519
+ 'brick',
1520
+ 'table runner',
1521
+ 'ashtray',
1522
+ 'street box',
1523
+ 'stick',
1524
+ 'hangers',
1525
+ 'cells',
1526
+ 'urinal',
1527
+ 'centerpiece',
1528
+ 'portable fridge',
1529
+ 'dvds',
1530
+ 'golf club',
1531
+ 'skirting board',
1532
+ 'water cooler',
1533
+ 'clipboard',
1534
+ 'camera, photographic camera',
1535
+ 'pigeonhole',
1536
+ 'chips',
1537
+ 'food processor',
1538
+ 'post box',
1539
+ 'lid',
1540
+ 'drum',
1541
+ 'blender',
1542
+ 'cave entrance',
1543
+ 'dental chair',
1544
+ 'obelisk',
1545
+ 'canoe',
1546
+ 'mobile',
1547
+ 'monitors',
1548
+ 'pool ball',
1549
+ 'cue rack',
1550
+ 'baggage carts',
1551
+ 'fork',
1552
+ 'paper filer',
1553
+ 'bicycle rack',
1554
+ 'coat rack',
1555
+ 'garland',
1556
+ 'sports bag',
1557
+ 'fish tank',
1558
+ 'towel dispenser',
1559
+ 'carriage',
1560
+ 'brochure',
1561
+ 'plaque',
1562
+ 'stringer',
1563
+ 'iron',
1564
+ 'spoon',
1565
+ 'flag pole',
1566
+ 'toilet brush',
1567
+ 'book stand',
1568
+ 'water faucet, water tap, tap, hydrant',
1569
+ 'ticket office',
1570
+ 'broom',
1571
+ 'dvd',
1572
+ 'ice bucket',
1573
+ 'carapace, shell, cuticle, shield',
1574
+ 'tureen',
1575
+ 'folders',
1576
+ 'chess',
1577
+ 'root',
1578
+ 'sewing machine',
1579
+ 'model',
1580
+ 'pen',
1581
+ 'violin',
1582
+ 'sweatshirt',
1583
+ 'recycling materials',
1584
+ 'mitten',
1585
+ 'chopping board, cutting board',
1586
+ 'mask',
1587
+ 'log',
1588
+ 'mouse, computer mouse',
1589
+ 'grill',
1590
+ 'hole',
1591
+ 'target',
1592
+ 'trash bag',
1593
+ 'chalk',
1594
+ 'sticks',
1595
+ 'balloon',
1596
+ 'score',
1597
+ 'hair spray',
1598
+ 'roll',
1599
+ 'runner',
1600
+ 'engine',
1601
+ 'inflatable glove',
1602
+ 'games',
1603
+ 'pallets',
1604
+ 'baskets',
1605
+ 'coop',
1606
+ 'dvd player',
1607
+ 'rocking horse',
1608
+ 'buckets',
1609
+ 'bread rolls',
1610
+ 'shawl',
1611
+ 'watering can',
1612
+ 'spotlights',
1613
+ 'post-it',
1614
+ 'bowls',
1615
+ 'security camera',
1616
+ 'runner cloth',
1617
+ 'lock',
1618
+ 'alarm, warning device, alarm system',
1619
+ 'side',
1620
+ 'roulette',
1621
+ 'bone',
1622
+ 'cutlery',
1623
+ 'pool balls',
1624
+ 'wheels',
1625
+ 'spice rack',
1626
+ 'plant pots',
1627
+ 'towel ring',
1628
+ 'bread box',
1629
+ 'video',
1630
+ 'funfair',
1631
+ 'breads',
1632
+ 'tripod',
1633
+ 'ironing board',
1634
+ 'skimmer',
1635
+ 'hollow',
1636
+ 'scratching post',
1637
+ 'tricycle',
1638
+ 'file box',
1639
+ 'mountain pass',
1640
+ 'tombstones',
1641
+ 'cooker',
1642
+ 'card game, cards',
1643
+ 'golf bag',
1644
+ 'towel paper',
1645
+ 'chaise lounge',
1646
+ 'sun',
1647
+ 'toilet paper holder',
1648
+ 'rake',
1649
+ 'key',
1650
+ 'umbrella stand',
1651
+ 'dartboard',
1652
+ 'transformer',
1653
+ 'fireplace utensils',
1654
+ 'sweatshirts',
1655
+ 'cellular telephone, cellular phone, cellphone, cell, mobile phone',
1656
+ 'tallboy',
1657
+ 'stapler',
1658
+ 'sauna',
1659
+ 'test tube',
1660
+ 'palette',
1661
+ 'shopping carts',
1662
+ 'tools',
1663
+ 'push button, push, button',
1664
+ 'star',
1665
+ 'roof rack',
1666
+ 'barbed wire',
1667
+ 'spray',
1668
+ 'ear',
1669
+ 'sponge',
1670
+ 'racket',
1671
+ 'tins',
1672
+ 'eyeglasses',
1673
+ 'file',
1674
+ 'scarfs',
1675
+ 'sugar bowl',
1676
+ 'flip flop',
1677
+ 'headstones',
1678
+ 'laptop bag',
1679
+ 'leash',
1680
+ 'climbing frame',
1681
+ 'suit hanger',
1682
+ 'floor spotlight',
1683
+ 'plate rack',
1684
+ 'sewer',
1685
+ 'hard drive',
1686
+ 'sprinkler',
1687
+ 'tools box',
1688
+ 'necklace',
1689
+ 'bulbs',
1690
+ 'steel industry',
1691
+ 'club',
1692
+ 'jack',
1693
+ 'door bars',
1694
+ 'control panel, instrument panel, control board, board, panel',
1695
+ 'hairbrush',
1696
+ 'napkin holder',
1697
+ 'office',
1698
+ 'smoke detector',
1699
+ 'utensils',
1700
+ 'apron',
1701
+ 'scissors',
1702
+ 'terminal',
1703
+ 'grinder',
1704
+ 'entry phone',
1705
+ 'newspaper stand',
1706
+ 'pepper shaker',
1707
+ 'onions',
1708
+ (
1709
+ 'central processing unit, cpu, c p u , central processor, processor,'
1710
+ ' mainframe'
1711
+ ),
1712
+ 'tape',
1713
+ 'bat',
1714
+ 'coaster',
1715
+ 'calculator',
1716
+ 'potatoes',
1717
+ 'luggage rack',
1718
+ 'salt',
1719
+ 'street number',
1720
+ 'viewpoint',
1721
+ 'sword',
1722
+ 'cd',
1723
+ 'rowing machine',
1724
+ 'plug',
1725
+ 'andiron, firedog, dog, dog-iron',
1726
+ 'pepper',
1727
+ 'tongs',
1728
+ 'bonfire',
1729
+ 'dog dish',
1730
+ 'belt',
1731
+ 'dumbbells',
1732
+ 'videocassette recorder, vcr',
1733
+ 'hook',
1734
+ 'envelopes',
1735
+ 'shower faucet',
1736
+ 'watch',
1737
+ 'padlock',
1738
+ 'swimming pool ladder',
1739
+ 'spanners',
1740
+ 'gravy boat',
1741
+ 'notice board',
1742
+ 'trash bags',
1743
+ 'fire alarm',
1744
+ 'ladle',
1745
+ 'stethoscope',
1746
+ 'rocket',
1747
+ 'funnel',
1748
+ 'bowling pins',
1749
+ 'valve',
1750
+ 'thermometer',
1751
+ 'cups',
1752
+ 'spice jar',
1753
+ 'night light',
1754
+ 'soaps',
1755
+ 'games table',
1756
+ 'slotted spoon',
1757
+ 'reel',
1758
+ 'scourer',
1759
+ 'sleeping robe',
1760
+ 'desk mat',
1761
+ 'dumbbell',
1762
+ 'hammer',
1763
+ 'tie',
1764
+ 'typewriter',
1765
+ 'shaker',
1766
+ 'cheese dish',
1767
+ 'sea star',
1768
+ 'racquet',
1769
+ 'butane gas cylinder',
1770
+ 'paper weight',
1771
+ 'shaving brush',
1772
+ 'sunglasses',
1773
+ 'gear shift',
1774
+ 'towel rail',
1775
+ 'adding machine, totalizer, totaliser',
1776
+ ]
1777
+
1778
+ ADE_847_STUFF_CLASS_ID = [
1779
+ 0, 2, 3, 4, 5, 8, 9, 12, 15, 16, 22, 24, 33, 47, 54, 368, 31, 195, 118, 134,
1780
+ 136, 53, 143, 90, 435, 546, 624, 111, 304,
1781
+ ]
1782
+
1783
+ ADE_847_THING_CLASS_ID = [
1784
+ i for i in ADE_847_CLASS_ID if i not in ADE_847_STUFF_CLASS_ID
1785
+ ]
1786
+
1787
+
1788
+ class ADE847Dataset(Dataset):
1789
+ """ADE847 dataset."""
1790
+
1791
+ def __init__(self, root, split='validation', transform=None):
1792
+ super(ADE847Dataset, self).__init__()
1793
+ self.root = root
1794
+ self.split = split
1795
+ self.transforms = transform
1796
+ self.image_dir = os.path.join(root, 'images_detectron2', split)
1797
+ self.mask_dir = os.path.join(root, 'annotations_detectron2', split)
1798
+ self.images = os.listdir(self.image_dir)
1799
+
1800
+ def process_mask(self, mask):
1801
+ mask = np.array(mask)
1802
+ mask[mask > 847] = 0
1803
+ return mask
1804
+
1805
+ def __getitem__(self, index):
1806
+ image_path = os.path.join(self.image_dir, self.images[index])
1807
+ image = Image.open(image_path).convert('RGB')
1808
+ target = (
1809
+ np.asarray(
1810
+ Image.open(
1811
+ os.path.join(
1812
+ self.mask_dir, self.images[index].replace('jpg', 'tif')
1813
+ )
1814
+ ),
1815
+ dtype=np.int32,
1816
+ )
1817
+ + 1
1818
+ )
1819
+ target = self.process_mask(target)
1820
+
1821
+ if self.transforms:
1822
+ image = self.transforms(image)
1823
+
1824
+ return image, image_path, target, index
1825
+
1826
+ def __len__(self):
1827
+ return len(self.images)
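
A minimal usage sketch for the ADE-847 loader above (assuming the file is importable as `data.ade847`; the dataset root and the resize size are placeholders, not values prescribed by the repo):

```python
# Hypothetical usage sketch -- paths and module name are placeholders.
from torchvision import transforms

from data.ade847 import ADE847Dataset, ADE_847_STUFF_CLASS_ID  # assumed module path

preprocess = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
])
dataset = ADE847Dataset(
    root='/path/to/ADE20K_847', split='validation', transform=preprocess)
image, image_path, target, index = dataset[0]
# `image` is a 3x512x512 tensor; `target` is the original-resolution integer
# mask in which labels above 847 have been zeroed out by `process_mask`.
print(image.shape, target.shape, len(ADE_847_STUFF_CLASS_ID))
```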
data/coco.py ADDED
@@ -0,0 +1,137 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """COCO Stuff Dataset."""
17
+
18
+ import os
19
+ import numpy as np
20
+ from PIL import Image
21
+ import torch
22
+
23
+
24
+ COCO_OBJECT_CLASSES = [
25
+ 'person with clothes,people,human',
26
+ 'bicycle',
27
+ 'car',
28
+ 'motorbike',
29
+ 'aeroplane',
30
+ 'bus',
31
+ 'train',
32
+ 'truck',
33
+ 'boat',
34
+ 'traffic light',
35
+ 'fire hydrant',
36
+ 'stop sign',
37
+ 'parking meter',
38
+ 'bench',
39
+ 'bird avian',
40
+ 'cat',
41
+ 'dog',
42
+ 'horse',
43
+ 'sheep',
44
+ 'cow',
45
+ 'elephant',
46
+ 'bear',
47
+ 'zebra',
48
+ 'giraffe',
49
+ 'backpack,bag',
50
+ 'umbrella,parasol',
51
+ 'handbag,purse',
52
+ 'necktie',
53
+ 'suitcase',
54
+ 'frisbee',
55
+ 'skis',
56
+ 'snowboard',
57
+ 'sports ball',
58
+ 'kite',
59
+ 'baseball bat',
60
+ 'glove',
61
+ 'skateboard',
62
+ 'surfboard',
63
+ 'tennis racket',
64
+ 'bottle',
65
+ 'wine glass',
66
+ 'cup',
67
+ 'fork',
68
+ 'knife',
69
+ 'dessertspoon',
70
+ 'bowl',
71
+ 'banana',
72
+ 'apple',
73
+ 'sandwich',
74
+ 'orange',
75
+ 'broccoli',
76
+ 'carrot',
77
+ 'hot dog',
78
+ 'pizza',
79
+ 'donut',
80
+ 'cake',
81
+ 'chair seat',
82
+ 'sofa',
83
+ 'pottedplant',
84
+ 'bed',
85
+ 'diningtable',
86
+ 'toilet',
87
+ 'tvmonitor screen',
88
+ 'laptop',
89
+ 'mouse',
90
+ 'remote control',
91
+ 'keyboard',
92
+ 'cell phone',
93
+ 'microwave',
94
+ 'oven',
95
+ 'toaster',
96
+ 'sink',
97
+ 'refrigerator',
98
+ 'book',
99
+ 'clock',
100
+ 'vase',
101
+ 'scissors',
102
+ 'teddy bear',
103
+ 'hairdrier,blowdrier',
104
+ 'toothbrush',
105
+ ]
106
+
107
+
108
+ class COCODataset(torch.utils.data.Dataset):
109
+ """COCO Object Dataset."""
110
+
111
+ def __init__(self, root, split='val', transform=None):
112
+ """Construct COCO Object Dataset.
113
+
114
+ Args:
115
+ root (string): Root directory where images are downloaded.
116
+ split (string): Dataset split to load, e.g. 'train' or 'val'.
117
+ transform (callable, optional): Optional transform to be applied on a
118
+ sample.
119
+ """
120
+ self.root = root
121
+ self.image_dir = os.path.join(root, 'images', f'{split}2017')
122
+ self.ann_dir = os.path.join(root, 'annotations', f'{split}2017')
123
+ self.images = os.listdir(self.image_dir)
124
+ self.transform = transform
125
+
126
+ def __getitem__(self, index):
127
+ img_path = os.path.join(self.image_dir, self.images[index])
128
+ img = Image.open(img_path).convert('RGB')
129
+ img = np.asarray(img)
130
+ idx = self.images[index].split('.')[0]
131
+ ann_path = os.path.join(self.ann_dir, f'{idx}_instanceTrainIds.png')
132
+ ann = np.asarray(Image.open(ann_path), dtype=np.int32)
133
+
134
+ return img, img_path, ann, idx
135
+
136
+ def __len__(self):
137
+ return len(self.images)
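
A hedged sketch of how the COCO object loader defined in `data/coco.py` might be consumed; the root path is a placeholder and assumes the `images/{split}2017` and `annotations/{split}2017` layout expected by the constructor:

```python
# Hypothetical usage sketch -- the COCO root path is a placeholder.
from data.coco import COCODataset, COCO_OBJECT_CLASSES  # assumed module path

dataset = COCODataset(root='/path/to/coco_object', split='val', transform=None)
img, img_path, ann, idx = dataset[0]
# `img` is an HxWx3 uint8 array, `ann` an HxW int32 map loaded from
# '<image id>_instanceTrainIds.png'; COCO_OBJECT_CLASSES holds the 80
# class-name prompts paired with this dataset.
print(img.shape, ann.shape, len(COCO_OBJECT_CLASSES))
```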
data/context.py ADDED
@@ -0,0 +1,126 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Pascal Context Dataset."""
17
+
18
+ from typing import Any, List, Tuple
19
+
20
+ import numpy as np
21
+ from PIL import Image
22
+ # pylint: disable=g-importing-member
23
+ from torchvision.datasets.voc import _VOCBase
24
+
25
+
26
+ PASCAL_CONTEXT_CLASSES = [
27
+ 'airplane', 'bag', 'bed', 'bedclothes', 'bench', 'bicycle', 'bird', 'boat',
28
+ 'book', 'bottle', 'building', 'bus', 'cabinet', 'car', 'cat', 'ceiling',
29
+ 'chair', 'cloth', 'computer', 'cow', 'cup', 'curtain', 'dog', 'door',
30
+ 'fence', 'floor', 'flower', 'food', 'grass', 'ground', 'horse', 'keyboard',
31
+ 'light', 'motorbike', 'mountain', 'mouse', 'person', 'plate', 'platform',
32
+ 'plant', 'road', 'rock', 'sheep', 'shelves', 'sidewalk', 'sign', 'sky',
33
+ 'snow', 'sofa', 'table', 'track', 'train', 'tree', 'truck', 'monitor',
34
+ 'wall', 'water', 'window', 'wood']
35
+
36
+ PASCAL_CONTEXT_STUFF_CLASS = [
37
+ 'bedclothes', 'ceiling', 'cloth', 'curtain', 'floor', 'grass', 'ground',
38
+ 'light', 'mountain', 'platform', 'road', 'sidewalk', 'sky', 'snow', 'wall',
39
+ 'water', 'window', 'wood', 'door', 'fence', 'rock']
40
+
41
+ PASCAL_CONTEXT_THING_CLASS = [
42
+ 'airplane', 'bag', 'bed', 'bench', 'bicycle', 'bird', 'boat', 'book',
43
+ 'bottle', 'building', 'bus', 'cabinet', 'car', 'cat', 'chair', 'computer',
44
+ 'cow', 'cup', 'dog', 'flower', 'food', 'horse', 'keyboard', 'motorbike',
45
+ 'mouse', 'person', 'plate', 'plant', 'sheep', 'shelves', 'sign', 'sofa',
46
+ 'table', 'track', 'train', 'tree', 'truck', 'monitor']
47
+
48
+ PASCAL_CONTEXT_STUFF_CLASS_ID = [
49
+ 3, 15, 17, 21, 25, 28, 29, 32, 34, 38, 40, 44, 46, 47, 55, 56, 57, 58, 23,
50
+ 24, 41]
51
+
52
+ PASCAL_CONTEXT_THING_CLASS_ID = [
53
+ 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 22, 26, 27,
54
+ 30, 31, 33, 35, 36, 37, 39, 42, 43, 45, 48, 49, 50, 51, 52, 53, 54]
55
+
56
+
57
+ class CONTEXTSegmentation(_VOCBase):
58
+ """Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/> Segmentation Dataset.
59
+
60
+ Attributes:
61
+ root (string): Root directory of the VOC Dataset.
62
+ year (string, optional): The dataset year, supports years ``"2007"`` to
63
+ ``"2012"``.
64
+ image_set (string, optional): Select the image_set to use, ``"train"``,
65
+ ``"trainval"`` or ``"val"``. If ``year=="2007"``, can also be
66
+ ``"test"``.
67
+ download (bool, optional): If true, downloads the dataset from the
68
+ internet and puts it in root directory. If dataset is already
69
+ downloaded, it is not downloaded again.
70
+ transform (callable, optional): A function/transform that takes in an PIL
71
+ image and returns a transformed version. E.g, ``transforms.RandomCrop``
72
+ target_transform (callable, optional): A function/transform that takes in
73
+ the target and transforms it.
74
+ transforms (callable, optional): A function/transform that takes input
75
+ sample and its target as entry and returns a transformed version.
76
+ """
77
+
78
+ _SPLITS_DIR = 'SegmentationContext'
79
+ _TARGET_DIR = 'SegmentationClassContext'
80
+ _TARGET_FILE_EXT = '.png'
81
+
82
+ @property
83
+ def masks(self):
84
+ return self.targets
85
+
86
+ def __getitem__(self, index):
87
+ """Get a sample of image and segmentation.
88
+
89
+ Args:
90
+ index (int): Index
91
+ Returns:
92
+ tuple: (image, target) where target is the image segmentation.
93
+ """
94
+ img = Image.open(self.images[index]).convert('RGB')
95
+ target = Image.open(self.masks[index])
96
+
97
+ if self.transforms is not None:
98
+ img, target = self.transforms(img, target)
99
+
100
+ return img, target
101
+
102
+
103
+ class CONTEXTDataset(CONTEXTSegmentation):
104
+ """Pascal Context Dataset."""
105
+
106
+ def __init__(self, root, year='2012', split='val', transform=None):
107
+ super(CONTEXTDataset, self).__init__(
108
+ root=root,
109
+ image_set=split,
110
+ year=year,
111
+ transform=transform,
112
+ download=False,
113
+ )
114
+ # self.idx_to_class = {val: key for (key, val) in CLASS2ID.items()}
115
+
116
+ def __getitem__(self, index):
117
+ image_path = self.images[index]
118
+ image = Image.open(image_path).convert('RGB')
119
+ target = np.asarray(Image.open(self.masks[index]), dtype=np.int32)
120
+ # transpose the target width and height
121
+ # target = target.transpose(1, 0)
122
+
123
+ if self.transforms:
124
+ image = self.transform(image)
125
+
126
+ return image, str(image_path), target, index
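
A hedged sketch of how `CONTEXTDataset` and the stuff/thing id lists might be used together (the VOC root is a placeholder and must contain the `SegmentationContext` split files the class expects):

```python
# Hypothetical usage sketch -- the VOC root is a placeholder.
from data.context import (  # assumed module path
    CONTEXTDataset, PASCAL_CONTEXT_CLASSES, PASCAL_CONTEXT_STUFF_CLASS_ID)

dataset = CONTEXTDataset(root='/path/to/pascal_voc', year='2012', split='val')
image, image_path, target, index = dataset[0]
# `target` holds Pascal Context label ids; the id lists above partition the
# 59 class names into 21 stuff and 38 thing categories.
num_stuff = sum(
    i in PASCAL_CONTEXT_STUFF_CLASS_ID
    for i in range(len(PASCAL_CONTEXT_CLASSES)))
print(len(PASCAL_CONTEXT_CLASSES), num_stuff)  # 59, 21
```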
data/gres.py ADDED
@@ -0,0 +1,455 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """grefer v0.1.
17
+
18
+ This interface provides access to gRefCOCO.
19
+
20
+ The following API functions are defined:
21
+ G_REFER - REFER api class
22
+ getRefIds - get ref ids that satisfy given filter conditions.
23
+ getAnnIds - get ann ids that satisfy given filter conditions.
24
+ getImgIds - get image ids that satisfy given filter conditions.
25
+ getCatIds - get category ids that satisfy given filter conditions.
26
+ loadRefs - load refs with the specified ref ids.
27
+ loadAnns - load anns with the specified ann ids.
28
+ loadImgs - load images with the specified image ids.
29
+ loadCats - load category names with the specified category ids.
30
+ getRefBox - get ref's bounding box [x, y, w, h] given the ref_id
31
+ showRef - show image, segmentation or box of the referred object with the
32
+ ref
33
+ getMaskByRef - get mask and area of the referred object given ref or ref ids
34
+ getMask - get mask and area of the referred object given ref
35
+ showMask - show mask of the referred object given ref
36
+ """
37
+ # Adapted from
38
+ # https://github.com/yz93/LAVT-RIS/blob/main/data/dataset_refer_bert.py
39
+
40
+ # pylint: disable=all
41
+ import itertools
42
+ import json
43
+ import os
44
+ import os.path as osp
45
+ import pickle
46
+ import time
47
+ # pylint: disable=g-importing-member
48
+ from matplotlib.collections import PatchCollection
49
+ from matplotlib.patches import Polygon
50
+ from matplotlib.patches import Rectangle
51
+ import matplotlib.pyplot as plt
52
+ import numpy as np
53
+ from PIL import Image
54
+ from pycocotools import mask
55
+ from skimage import io
56
+ import torch
57
+ from torch.utils import data
58
+
59
+
60
+ class G_REFER:
61
+ """GRES dataset."""
62
+
63
+ def __init__(self, data_root, dataset='grefcoco', splitBy='unc'):
64
+ # provide data_root folder which contains grefcoco
65
+ print('loading dataset %s into memory...' % dataset)
66
+ self.ROOT_DIR = osp.abspath(osp.dirname(__file__))
67
+ self.DATA_DIR = osp.join(data_root, dataset)
68
+ if dataset in ['grefcoco']:
69
+ self.IMAGE_DIR = osp.join(data_root, 'images/mscoco/images/train2014')
70
+ else:
71
+ raise KeyError('No refer dataset is called [%s]' % dataset)
72
+
73
+ tic = time.time()
74
+
75
+ # load refs from data/dataset/refs(dataset).json
76
+ self.data = {}
77
+ self.data['dataset'] = dataset
78
+
79
+ ref_file = osp.join(self.DATA_DIR, f'grefs({splitBy}).p')
80
+ if osp.exists(ref_file):
81
+ self.data['refs'] = pickle.load(open(ref_file, 'rb'), fix_imports=True)
82
+ else:
83
+ ref_file = osp.join(self.DATA_DIR, f'grefs({splitBy}).json')
84
+ if osp.exists(ref_file):
85
+ self.data['refs'] = json.load(open(ref_file, 'rb'))
86
+ else:
87
+ raise FileNotFoundError('JSON file not found')
88
+
89
+ # load annotations from data/dataset/instances.json
90
+ instances_file = osp.join(self.DATA_DIR, 'instances.json')
91
+ instances = json.load(open(instances_file, 'r'))
92
+ self.data['images'] = instances['images']
93
+ self.data['annotations'] = instances['annotations']
94
+ self.data['categories'] = instances['categories']
95
+
96
+ # create index
97
+ self.createIndex()
98
+ print('DONE (t=%.2fs)' % (time.time() - tic))
99
+
100
+ @staticmethod
101
+ def _toList(x):
102
+ return x if isinstance(x, list) else [x]
103
+
104
+ @staticmethod
105
+ def match_any(a, b):
106
+ a = a if isinstance(a, list) else [a]
107
+ b = b if isinstance(b, list) else [b]
108
+ return set(a) & set(b)
109
+
110
+ def createIndex(self):
111
+ # create sets of mapping
112
+ # 1) Refs: {ref_id: ref}
113
+ # 2) Anns: {ann_id: ann}
114
+ # 3) Imgs: {image_id: image}
115
+ # 4) Cats: {category_id: category_name}
116
+ # 5) Sents: {sent_id: sent}
117
+ # 6) imgToRefs: {image_id: refs}
118
+ # 7) imgToAnns: {image_id: anns}
119
+ # 8) refToAnn: {ref_id: ann}
120
+ # 9) annToRef: {ann_id: ref}
121
+ # 10) catToRefs: {category_id: refs}
122
+ # 11) sentToRef: {sent_id: ref}
123
+ # 12) sentToTokens: {sent_id: tokens}
124
+ print('creating index...')
125
+ # fetch info from instances
126
+ Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
127
+ Anns[-1] = None
128
+ for ann in self.data['annotations']:
129
+ Anns[ann['id']] = ann
130
+ imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann]
131
+ for img in self.data['images']:
132
+ Imgs[img['id']] = img
133
+ for cat in self.data['categories']:
134
+ Cats[cat['id']] = cat['name']
135
+
136
+ # fetch info from refs
137
+ Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
138
+ Sents, sentToRef, sentToTokens = {}, {}, {}
139
+ availableSplits = []
140
+ for ref in self.data['refs']:
141
+ # ids
142
+ ref_id = ref['ref_id']
143
+ ann_id = ref['ann_id']
144
+ category_id = ref['category_id']
145
+ image_id = ref['image_id']
146
+
147
+ if ref['split'] not in availableSplits:
148
+ availableSplits.append(ref['split'])
149
+
150
+ # add mapping related to ref
151
+ if ref_id in Refs:
152
+ print('Duplicate ref id')
153
+ Refs[ref_id] = ref
154
+ imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
155
+
156
+ category_id = self._toList(category_id)
157
+ added_cats = []
158
+ for cat in category_id:
159
+ if cat not in added_cats:
160
+ added_cats.append(cat)
161
+ catToRefs[cat] = catToRefs.get(cat, []) + [ref]
162
+
163
+ ann_id = self._toList(ann_id)
164
+ refToAnn[ref_id] = [Anns[ann] for ann in ann_id]
165
+ for ann_id_n in ann_id:
166
+ annToRef[ann_id_n] = annToRef.get(ann_id_n, []) + [ref]
167
+
168
+ # add mapping of sent
169
+ for sent in ref['sentences']:
170
+ Sents[sent['sent_id']] = sent
171
+ sentToRef[sent['sent_id']] = ref
172
+ sentToTokens[sent['sent_id']] = sent['tokens']
173
+
174
+ # create class members
175
+ self.Refs = Refs
176
+ self.Anns = Anns
177
+ self.Imgs = Imgs
178
+ self.Cats = Cats
179
+ self.Sents = Sents
180
+ self.imgToRefs = imgToRefs
181
+ self.imgToAnns = imgToAnns
182
+ self.refToAnn = refToAnn
183
+ self.annToRef = annToRef
184
+ self.catToRefs = catToRefs
185
+ self.sentToRef = sentToRef
186
+ self.sentToTokens = sentToTokens
187
+ self.availableSplits = availableSplits
188
+ print('index created.')
189
+
190
+ def getRefIds(self, image_ids=[], cat_ids=[], split=[]):
191
+ image_ids = self._toList(image_ids)
192
+ cat_ids = self._toList(cat_ids)
193
+ split = self._toList(split)
194
+
195
+ for s in split:
196
+ if s not in self.availableSplits:
197
+ raise ValueError(f'Invalid split name: {s}')
198
+
199
+ refs = self.data['refs']
200
+
201
+ if len(image_ids) > 0:
202
+ lists = [self.imgToRefs[image_id] for image_id in image_ids]
203
+ refs = list(itertools.chain.from_iterable(lists))
204
+ if len(cat_ids) > 0:
205
+ refs = [
206
+ ref for ref in refs if self.match_any(ref['category_id'], cat_ids)
207
+ ]
208
+ if len(split) > 0:
209
+ refs = [ref for ref in refs if ref['split'] in split]
210
+
211
+ ref_ids = [ref['ref_id'] for ref in refs]
212
+ return ref_ids
213
+
214
+ def getAnnIds(self, image_ids=[], ref_ids=[]):
215
+ image_ids = self._toList(image_ids)
216
+ ref_ids = self._toList(ref_ids)
217
+
218
+ if any([len(image_ids), len(ref_ids)]):
219
+ if len(image_ids) > 0:
220
+ lists = [
221
+ self.imgToAnns[image_id]
222
+ for image_id in image_ids
223
+ if image_id in self.imgToAnns
224
+ ]
225
+ anns = list(itertools.chain.from_iterable(lists))
226
+ else:
227
+ anns = self.data['annotations']
228
+ ann_ids = [ann['id'] for ann in anns]
229
+ if len(ref_ids) > 0:
230
+ lists = [self.Refs[ref_id]['ann_id'] for ref_id in ref_ids]
231
+ anns_by_ref_id = list(itertools.chain.from_iterable(lists))
232
+ ann_ids = list(set(ann_ids).intersection(set(anns_by_ref_id)))
233
+ else:
234
+ ann_ids = [ann['id'] for ann in self.data['annotations']]
235
+
236
+ return ann_ids
237
+
238
+ def getImgIds(self, ref_ids=[]):
239
+ ref_ids = self._toList(ref_ids)
240
+
241
+ if len(ref_ids) > 0:
242
+ image_ids = list(
243
+ set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids])
244
+ )
245
+ else:
246
+ image_ids = self.Imgs.keys()
247
+ return image_ids
248
+
249
+ def getCatIds(self):
250
+ return self.Cats.keys()
251
+
252
+ def loadRefs(self, ref_ids=[]):
253
+ return [self.Refs[ref_id] for ref_id in self._toList(ref_ids)]
254
+
255
+ def loadAnns(self, ann_ids=[]):
256
+ if isinstance(ann_ids, str):
257
+ ann_ids = int(ann_ids)
258
+ return [self.Anns[ann_id] for ann_id in self._toList(ann_ids)]
259
+
260
+ def loadImgs(self, image_ids=[]):
261
+ return [self.Imgs[image_id] for image_id in self._toList(image_ids)]
262
+
263
+ def loadCats(self, cat_ids=[]):
264
+ return [self.Cats[cat_id] for cat_id in self._toList(cat_ids)]
265
+
266
+ def getRefBox(self, ref_id):
267
+ anns = self.refToAnn[ref_id]
268
+ return [ann['bbox'] for ann in anns] # [x, y, w, h]
269
+
270
+ def showRef(self, ref, seg_box='seg'):
271
+ ax = plt.gca()
272
+ # show image
273
+ image = self.Imgs[ref['image_id']]
274
+ I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
275
+ ax.imshow(I)
276
+ # show refer expression
277
+ for sid, sent in enumerate(ref['sentences']):
278
+ print('%s. %s' % (sid + 1, sent['sent']))
279
+ # show segmentations
280
+ if seg_box == 'seg':
281
+ ann_id = ref['ann_id']
282
+ ann = self.Anns[ann_id]
283
+ polygons = []
284
+ color = []
285
+ c = 'none'
286
+ if type(ann['segmentation'][0]) == list:
287
+ # polygon used for refcoco*
288
+ for seg in ann['segmentation']:
289
+ poly = np.array(seg).reshape((len(seg) // 2, 2))
290
+ polygons.append(Polygon(poly, True, alpha=0.4))
291
+ color.append(c)
292
+ p = PatchCollection(
293
+ polygons,
294
+ facecolors=color,
295
+ edgecolors=(1, 1, 0, 0),
296
+ linewidths=3,
297
+ alpha=1,
298
+ )
299
+ ax.add_collection(p) # thick yellow polygon
300
+ p = PatchCollection(
301
+ polygons,
302
+ facecolors=color,
303
+ edgecolors=(1, 0, 0, 0),
304
+ linewidths=1,
305
+ alpha=1,
306
+ )
307
+ ax.add_collection(p) # thin red polygon
308
+ else:
309
+ # mask used for refclef
310
+ rle = ann['segmentation']
311
+ m = mask.decode(rle)
312
+ img = np.ones((m.shape[0], m.shape[1], 3))
313
+ color_mask = np.array([2.0, 166.0, 101.0]) / 255
314
+ for i in range(3):
315
+ img[:, :, i] = color_mask[i]
316
+ ax.imshow(np.dstack((img, m * 0.5)))
317
+ # show bounding-box
318
+ elif seg_box == 'box':
319
+ # ann_id = ref['ann_id']
320
+ # ann = self.Anns[ann_id]
321
+ bbox = self.getRefBox(ref['ref_id'])
322
+ box_plot = Rectangle(
323
+ (bbox[0], bbox[1]),
324
+ bbox[2],
325
+ bbox[3],
326
+ fill=False,
327
+ edgecolor='green',
328
+ linewidth=3,
329
+ )
330
+ ax.add_patch(box_plot)
331
+
332
+ def getMask(self, ann):
333
+ if not ann:
334
+ return None
335
+ if ann['iscrowd']:
336
+ raise ValueError('Crowd object')
337
+ image = self.Imgs[ann['image_id']]
338
+ if type(ann['segmentation'][0]) == list: # polygon
339
+ rle = mask.frPyObjects(
340
+ ann['segmentation'], image['height'], image['width']
341
+ )
342
+ else:
343
+ rle = ann['segmentation']
344
+
345
+ m = mask.decode(rle)
346
+ # sometimes there are multiple binary map (corresponding to multiple segs)
347
+ m = np.sum(m, axis=2)
348
+ m = m.astype(np.uint8) # convert to np.uint8
349
+ # compute area
350
+ area = sum(mask.area(rle)) # should be close to ann['area']
351
+ return {'mask': m, 'area': area}
352
+
353
+ def getMaskByRef(self, ref=None, ref_id=None, merge=False):
354
+ if not ref and not ref_id:
355
+ raise ValueError
356
+ if ref:
357
+ ann_ids = ref['ann_id']
358
+ ref_id = ref['ref_id']
359
+ else:
360
+ ann_ids = self.getAnnIds(ref_ids=ref_id)
361
+
362
+ if ann_ids == [-1]:
363
+ img = self.Imgs[self.Refs[ref_id]['image_id']]
364
+ return {
365
+ 'mask': np.zeros([img['height'], img['width']], dtype=np.uint8),
366
+ 'empty': True,
367
+ }
368
+
369
+ anns = self.loadAnns(ann_ids)
370
+ mask_list = [self.getMask(ann) for ann in anns if not ann['iscrowd']]
371
+
372
+ if merge:
373
+ merged_masks = sum([mask['mask'] for mask in mask_list])
374
+ merged_masks[np.where(merged_masks > 1)] = 1
375
+ return {'mask': merged_masks, 'empty': False}
376
+ else:
377
+ return mask_list
378
+
379
+ def showMask(self, ref):
380
+ M = self.getMask(ref)
381
+ msk = M['mask']
382
+ ax = plt.gca()
383
+ ax.imshow(msk)
384
+
385
+
386
+ class GReferDataset(data.Dataset):
387
+
388
+ def __init__(self, root, transform=None, split='val'):
389
+
390
+ self.classes = []
391
+ self.image_transforms = transform
392
+ self.split = split
393
+ self.refer = G_REFER(root)
394
+
395
+ ref_ids = self.refer.getRefIds(split=self.split)
396
+ img_ids = self.refer.getImgIds(ref_ids)
397
+
398
+ all_imgs = self.refer.Imgs
399
+ self.imgs = list(all_imgs[i] for i in img_ids)
400
+ self.ref_ids = []
401
+ # print(len(ref_ids))
402
+ # print(len(self.imgs))
403
+ self.sentence_raw = []
404
+ # if we are testing on a dataset, test all sentences of an object;
405
+ # o/w, we are validating during training, randomly sample one sentence
406
+ # for efficiency
407
+ for r in ref_ids:
408
+ ref = self.refer.Refs[r]
409
+ # ref_sentences = []
410
+ # for i, (el, sent_id) in enumerate(zip(ref['sentences'],
411
+ # ref['sent_ids'])):
412
+ for el in ref['sentences']:
413
+ sentence_raw = el['raw']
414
+ if len(sentence_raw) == 0:
415
+ continue
416
+ self.sentence_raw.append(sentence_raw)
417
+ self.ref_ids.append(r)
418
+
419
+ # print(len(self.sentence_raw))
420
+
421
+ def get_classes(self):
422
+ return self.classes
423
+
424
+ def __len__(self):
425
+ return len(self.ref_ids)
426
+
427
+ def __getitem__(self, index):
428
+ this_ref_id = self.ref_ids[index]
429
+ this_img_id = self.refer.getImgIds(this_ref_id)
430
+ this_img = self.refer.Imgs[this_img_id[0]]
431
+ # print(this_ref_id, this_img_id)
432
+ # print(len(self.ref_ids))
433
+ img_path = os.path.join(self.refer.IMAGE_DIR, this_img['file_name'])
434
+ img = Image.open(img_path).convert('RGB')
435
+ ref = self.refer.loadRefs(this_ref_id)
436
+ # print("ref",ref)
437
+
438
+ ref_mask_ann = self.refer.getMaskByRef(ref[0])
439
+ if type(ref_mask_ann) == list:
440
+ ref_mask_ann = ref_mask_ann[0]
441
+ ref_mask = ref_mask_ann['mask']
442
+ annot = np.zeros(ref_mask.shape)
443
+ annot[ref_mask == 1] = 1
444
+
445
+ target = Image.fromarray(annot.astype(np.uint8), mode='P')
446
+ # print(np.array(target), np.unique(np.array(target).flatten()))
447
+ if self.image_transforms is not None:
448
+ # resize, from PIL to tensor, and mean and std normalization
449
+ img = self.image_transforms(img)
450
+ # target = self.target_transforms(target)
451
+ target = torch.as_tensor(np.array(target, copy=True))
452
+ # target = target.permute((2, 0, 1))
453
+ sentence = self.sentence_raw[index]
454
+
455
+ return img, img_path, target, sentence
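
For context, here is a minimal usage sketch of the `GReferDataset` defined above. It is a hedged example, not part of the commit: the data root `/datasets/grefcoco` and the 512x512 resize are assumptions, and each item yields the image tensor, its file path, the binary referring mask, and one raw sentence.

```python
# Hedged usage sketch of GReferDataset; the data root below is an assumption.
import torch
from torchvision import transforms
from data.gres import GReferDataset

transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
])

dataset = GReferDataset(root='/datasets/grefcoco', transform=transform, split='val')
loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)

# Each batch holds (image tensor, image path, target mask, referring sentence).
for img, img_path, target, sentence in loader:
    print(img.shape, target.shape, sentence)
    break
```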
data/pascal459.py ADDED
@@ -0,0 +1,998 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Pascal-459 Dataset."""
17
+
18
+ import os
19
+ import numpy as np
20
+ from PIL import Image
21
+ # pylint: disable=g-importing-member
22
+ from torch.utils.data import Dataset
23
+
24
+
25
+ PASCAL_459_CLASSES = [
26
+ 'accordion',
27
+ 'aeroplane',
28
+ 'air conditioner',
29
+ 'antenna',
30
+ 'artillery',
31
+ 'ashtray',
32
+ 'atrium',
33
+ 'baby carriage',
34
+ 'bag',
35
+ 'ball',
36
+ 'balloon',
37
+ 'bamboo weaving',
38
+ 'barrel',
39
+ 'baseball bat',
40
+ 'basket',
41
+ 'basketball backboard',
42
+ 'bathtub',
43
+ 'bed',
44
+ 'bedclothes',
45
+ 'beer',
46
+ 'bell',
47
+ 'bench',
48
+ 'bicycle',
49
+ 'binoculars',
50
+ 'bird',
51
+ 'bird cage',
52
+ 'bird feeder',
53
+ 'bird nest',
54
+ 'blackboard',
55
+ 'board',
56
+ 'boat',
57
+ 'bone',
58
+ 'book',
59
+ 'bottle',
60
+ 'bottle opener',
61
+ 'bowl',
62
+ 'box',
63
+ 'bracelet',
64
+ 'brick',
65
+ 'bridge',
66
+ 'broom',
67
+ 'brush',
68
+ 'bucket',
69
+ 'building',
70
+ 'bus',
71
+ 'cabinet',
72
+ 'cabinet door',
73
+ 'cage',
74
+ 'cake',
75
+ 'calculator',
76
+ 'calendar',
77
+ 'camel',
78
+ 'camera',
79
+ 'camera lens',
80
+ 'can',
81
+ 'candle',
82
+ 'candle holder',
83
+ 'cap',
84
+ 'car',
85
+ 'card',
86
+ 'cart',
87
+ 'case',
88
+ 'casette recorder',
89
+ 'cash register',
90
+ 'cat',
91
+ 'cd',
92
+ 'cd player',
93
+ 'ceiling',
94
+ 'cell phone',
95
+ 'cello',
96
+ 'chain',
97
+ 'chair',
98
+ 'chessboard',
99
+ 'chicken',
100
+ 'chopstick',
101
+ 'clip',
102
+ 'clippers',
103
+ 'clock',
104
+ 'closet',
105
+ 'cloth',
106
+ 'clothes tree',
107
+ 'coffee',
108
+ 'coffee machine',
109
+ 'comb',
110
+ 'computer',
111
+ 'concrete',
112
+ 'cone',
113
+ 'container',
114
+ 'control booth',
115
+ 'controller',
116
+ 'cooker',
117
+ 'copying machine',
118
+ 'coral',
119
+ 'cork',
120
+ 'corkscrew',
121
+ 'counter',
122
+ 'court',
123
+ 'cow',
124
+ 'crabstick',
125
+ 'crane',
126
+ 'crate',
127
+ 'cross',
128
+ 'crutch',
129
+ 'cup',
130
+ 'curtain',
131
+ 'cushion',
132
+ 'cutting board',
133
+ 'dais',
134
+ 'disc',
135
+ 'disc case',
136
+ 'dishwasher',
137
+ 'dock',
138
+ 'dog',
139
+ 'dolphin',
140
+ 'door',
141
+ 'drainer',
142
+ 'dray',
143
+ 'drink dispenser',
144
+ 'drinking machine',
145
+ 'drop',
146
+ 'drug',
147
+ 'drum',
148
+ 'drum kit',
149
+ 'duck',
150
+ 'dumbbell',
151
+ 'earphone',
152
+ 'earrings',
153
+ 'egg',
154
+ 'electric fan',
155
+ 'electric iron',
156
+ 'electric pot',
157
+ 'electric saw',
158
+ 'electronic keyboard',
159
+ 'engine',
160
+ 'envelope',
161
+ 'equipment',
162
+ 'escalator',
163
+ 'exhibition booth',
164
+ 'extinguisher',
165
+ 'eyeglass',
166
+ 'fan',
167
+ 'faucet',
168
+ 'fax machine',
169
+ 'fence',
170
+ 'ferris wheel',
171
+ 'fire extinguisher',
172
+ 'fire hydrant',
173
+ 'fire place',
174
+ 'fish',
175
+ 'fish tank',
176
+ 'fishbowl',
177
+ 'fishing net',
178
+ 'fishing pole',
179
+ 'flag',
180
+ 'flagstaff',
181
+ 'flame',
182
+ 'flashlight',
183
+ 'floor',
184
+ 'flower',
185
+ 'fly',
186
+ 'foam',
187
+ 'food',
188
+ 'footbridge',
189
+ 'forceps',
190
+ 'fork',
191
+ 'forklift',
192
+ 'fountain',
193
+ 'fox',
194
+ 'frame',
195
+ 'fridge',
196
+ 'frog',
197
+ 'fruit',
198
+ 'funnel',
199
+ 'furnace',
200
+ 'game controller',
201
+ 'game machine',
202
+ 'gas cylinder',
203
+ 'gas hood',
204
+ 'gas stove',
205
+ 'gift box',
206
+ 'glass',
207
+ 'glass marble',
208
+ 'globe',
209
+ 'glove',
210
+ 'goal',
211
+ 'grandstand',
212
+ 'grass',
213
+ 'gravestone',
214
+ 'ground',
215
+ 'guardrail',
216
+ 'guitar',
217
+ 'gun',
218
+ 'hammer',
219
+ 'hand cart',
220
+ 'handle',
221
+ 'handrail',
222
+ 'hanger',
223
+ 'hard disk drive',
224
+ 'hat',
225
+ 'hay',
226
+ 'headphone',
227
+ 'heater',
228
+ 'helicopter',
229
+ 'helmet',
230
+ 'holder',
231
+ 'hook',
232
+ 'horse',
233
+ 'horse-drawn carriage',
234
+ 'hot-air balloon',
235
+ 'hydrovalve',
236
+ 'ice',
237
+ 'inflator pump',
238
+ 'ipod',
239
+ 'iron',
240
+ 'ironing board',
241
+ 'jar',
242
+ 'kart',
243
+ 'kettle',
244
+ 'key',
245
+ 'keyboard',
246
+ 'kitchen range',
247
+ 'kite',
248
+ 'knife',
249
+ 'knife block',
250
+ 'ladder',
251
+ 'ladder truck',
252
+ 'ladle',
253
+ 'laptop',
254
+ 'leaves',
255
+ 'lid',
256
+ 'life buoy',
257
+ 'light',
258
+ 'light bulb',
259
+ 'lighter',
260
+ 'line',
261
+ 'lion',
262
+ 'lobster',
263
+ 'lock',
264
+ 'machine',
265
+ 'mailbox',
266
+ 'mannequin',
267
+ 'map',
268
+ 'mask',
269
+ 'mat',
270
+ 'match book',
271
+ 'mattress',
272
+ 'menu',
273
+ 'metal',
274
+ 'meter box',
275
+ 'microphone',
276
+ 'microwave',
277
+ 'mirror',
278
+ 'missile',
279
+ 'model',
280
+ 'money',
281
+ 'monkey',
282
+ 'mop',
283
+ 'motorbike',
284
+ 'mountain',
285
+ 'mouse',
286
+ 'mouse pad',
287
+ 'musical instrument',
288
+ 'napkin',
289
+ 'net',
290
+ 'newspaper',
291
+ 'oar',
292
+ 'ornament',
293
+ 'outlet',
294
+ 'oven',
295
+ 'oxygen bottle',
296
+ 'pack',
297
+ 'pan',
298
+ 'paper',
299
+ 'paper box',
300
+ 'paper cutter',
301
+ 'parachute',
302
+ 'parasol',
303
+ 'parterre',
304
+ 'patio',
305
+ 'pelage',
306
+ 'pen',
307
+ 'pen container',
308
+ 'pencil',
309
+ 'person',
310
+ 'photo',
311
+ 'piano',
312
+ 'picture',
313
+ 'pig',
314
+ 'pillar',
315
+ 'pillow',
316
+ 'pipe',
317
+ 'pitcher',
318
+ 'plant',
319
+ 'plastic',
320
+ 'plate',
321
+ 'platform',
322
+ 'player',
323
+ 'playground',
324
+ 'pliers',
325
+ 'plume',
326
+ 'poker',
327
+ 'poker chip',
328
+ 'pole',
329
+ 'pool table',
330
+ 'postcard',
331
+ 'poster',
332
+ 'pot',
333
+ 'pottedplant',
334
+ 'printer',
335
+ 'projector',
336
+ 'pumpkin',
337
+ 'rabbit',
338
+ 'racket',
339
+ 'radiator',
340
+ 'radio',
341
+ 'rail',
342
+ 'rake',
343
+ 'ramp',
344
+ 'range hood',
345
+ 'receiver',
346
+ 'recorder',
347
+ 'recreational machines',
348
+ 'remote control',
349
+ 'road',
350
+ 'robot',
351
+ 'rock',
352
+ 'rocket',
353
+ 'rocking horse',
354
+ 'rope',
355
+ 'rug',
356
+ 'ruler',
357
+ 'runway',
358
+ 'saddle',
359
+ 'sand',
360
+ 'saw',
361
+ 'scale',
362
+ 'scanner',
363
+ 'scissors',
364
+ 'scoop',
365
+ 'screen',
366
+ 'screwdriver',
367
+ 'sculpture',
368
+ 'scythe',
369
+ 'sewer',
370
+ 'sewing machine',
371
+ 'shed',
372
+ 'sheep',
373
+ 'shell',
374
+ 'shelves',
375
+ 'shoe',
376
+ 'shopping cart',
377
+ 'shovel',
378
+ 'sidecar',
379
+ 'sidewalk',
380
+ 'sign',
381
+ 'signal light',
382
+ 'sink',
383
+ 'skateboard',
384
+ 'ski',
385
+ 'sky',
386
+ 'sled',
387
+ 'slippers',
388
+ 'smoke',
389
+ 'snail',
390
+ 'snake',
391
+ 'snow',
392
+ 'snowmobiles',
393
+ 'sofa',
394
+ 'spanner',
395
+ 'spatula',
396
+ 'speaker',
397
+ 'speed bump',
398
+ 'spice container',
399
+ 'spoon',
400
+ 'sprayer',
401
+ 'squirrel',
402
+ 'stage',
403
+ 'stair',
404
+ 'stapler',
405
+ 'stick',
406
+ 'sticky note',
407
+ 'stone',
408
+ 'stool',
409
+ 'stove',
410
+ 'straw',
411
+ 'stretcher',
412
+ 'sun',
413
+ 'sunglass',
414
+ 'sunshade',
415
+ 'surveillance camera',
416
+ 'swan',
417
+ 'sweeper',
418
+ 'swim ring',
419
+ 'swimming pool',
420
+ 'swing',
421
+ 'switch',
422
+ 'table',
423
+ 'tableware',
424
+ 'tank',
425
+ 'tap',
426
+ 'tape',
427
+ 'tarp',
428
+ 'telephone',
429
+ 'telephone booth',
430
+ 'tent',
431
+ 'tire',
432
+ 'toaster',
433
+ 'toilet',
434
+ 'tong',
435
+ 'tool',
436
+ 'toothbrush',
437
+ 'towel',
438
+ 'toy',
439
+ 'toy car',
440
+ 'track',
441
+ 'train',
442
+ 'trampoline',
443
+ 'trash bin',
444
+ 'tray',
445
+ 'tree',
446
+ 'tricycle',
447
+ 'tripod',
448
+ 'trophy',
449
+ 'truck',
450
+ 'tube',
451
+ 'turtle',
452
+ 'tvmonitor',
453
+ 'tweezers',
454
+ 'typewriter',
455
+ 'umbrella',
456
+ 'unknown',
457
+ 'vacuum cleaner',
458
+ 'vending machine',
459
+ 'video camera',
460
+ 'video game console',
461
+ 'video player',
462
+ 'video tape',
463
+ 'violin',
464
+ 'wakeboard',
465
+ 'wall',
466
+ 'wallet',
467
+ 'wardrobe',
468
+ 'washing machine',
469
+ 'watch',
470
+ 'water',
471
+ 'water dispenser',
472
+ 'water pipe',
473
+ 'water skate board',
474
+ 'watermelon',
475
+ 'whale',
476
+ 'wharf',
477
+ 'wheel',
478
+ 'wheelchair',
479
+ 'window',
480
+ 'window blinds',
481
+ 'wineglass',
482
+ 'wire',
483
+ 'wood',
484
+ 'wool',
485
+ ]
486
+
487
+ PASCAL_459_CLASSE_ID = list(range(459))
488
+
489
+
490
+ PASCAL_459_STUFF_CLASS = [
491
+ 'atrium',
492
+ 'ceiling',
493
+ 'concrete',
494
+ 'coral',
495
+ 'court',
496
+ 'dock',
497
+ 'floor',
498
+ 'foam',
499
+ 'grass',
500
+ 'ground',
501
+ 'ice',
502
+ 'leaves',
503
+ 'mountain',
504
+ 'parterre',
505
+ 'patio',
506
+ 'road',
507
+ 'rock',
508
+ 'rug',
509
+ 'sand',
510
+ 'sky',
511
+ 'snow',
512
+ 'stone',
513
+ 'sun',
514
+ 'wall',
515
+ 'water',
516
+ 'wood',
517
+ ]
518
+
519
+ PASCAL_459_THING_CLASS = [
520
+ 'accordion',
521
+ 'aeroplane',
522
+ 'air conditioner',
523
+ 'antenna',
524
+ 'artillery',
525
+ 'ashtray',
526
+ 'baby carriage',
527
+ 'bag',
528
+ 'ball',
529
+ 'balloon',
530
+ 'bamboo weaving',
531
+ 'barrel',
532
+ 'baseball bat',
533
+ 'basket',
534
+ 'basketball backboard',
535
+ 'bathtub',
536
+ 'bed',
537
+ 'bedclothes',
538
+ 'beer',
539
+ 'bell',
540
+ 'bench',
541
+ 'bicycle',
542
+ 'binoculars',
543
+ 'bird',
544
+ 'bird cage',
545
+ 'bird feeder',
546
+ 'bird nest',
547
+ 'blackboard',
548
+ 'board',
549
+ 'boat',
550
+ 'bone',
551
+ 'book',
552
+ 'bottle',
553
+ 'bottle opener',
554
+ 'bowl',
555
+ 'box',
556
+ 'bracelet',
557
+ 'brick',
558
+ 'bridge',
559
+ 'broom',
560
+ 'brush',
561
+ 'bucket',
562
+ 'building',
563
+ 'bus',
564
+ 'cabinet',
565
+ 'cabinet door',
566
+ 'cage',
567
+ 'cake',
568
+ 'calculator',
569
+ 'calendar',
570
+ 'camel',
571
+ 'camera',
572
+ 'camera lens',
573
+ 'can',
574
+ 'candle',
575
+ 'candle holder',
576
+ 'cap',
577
+ 'car',
578
+ 'card',
579
+ 'cart',
580
+ 'case',
581
+ 'casette recorder',
582
+ 'cash register',
583
+ 'cat',
584
+ 'cd',
585
+ 'cd player',
586
+ 'cell phone',
587
+ 'cello',
588
+ 'chain',
589
+ 'chair',
590
+ 'chessboard',
591
+ 'chicken',
592
+ 'chopstick',
593
+ 'clip',
594
+ 'clippers',
595
+ 'clock',
596
+ 'closet',
597
+ 'cloth',
598
+ 'clothes tree',
599
+ 'coffee',
600
+ 'coffee machine',
601
+ 'comb',
602
+ 'computer',
603
+ 'cone',
604
+ 'container',
605
+ 'control booth',
606
+ 'controller',
607
+ 'cooker',
608
+ 'copying machine',
609
+ 'cork',
610
+ 'corkscrew',
611
+ 'counter',
612
+ 'cow',
613
+ 'crabstick',
614
+ 'crane',
615
+ 'crate',
616
+ 'cross',
617
+ 'crutch',
618
+ 'cup',
619
+ 'curtain',
620
+ 'cushion',
621
+ 'cutting board',
622
+ 'dais',
623
+ 'disc',
624
+ 'disc case',
625
+ 'dishwasher',
626
+ 'dog',
627
+ 'dolphin',
628
+ 'door',
629
+ 'drainer',
630
+ 'dray',
631
+ 'drink dispenser',
632
+ 'drinking machine',
633
+ 'drop',
634
+ 'drug',
635
+ 'drum',
636
+ 'drum kit',
637
+ 'duck',
638
+ 'dumbbell',
639
+ 'earphone',
640
+ 'earrings',
641
+ 'egg',
642
+ 'electric fan',
643
+ 'electric iron',
644
+ 'electric pot',
645
+ 'electric saw',
646
+ 'electronic keyboard',
647
+ 'engine',
648
+ 'envelope',
649
+ 'equipment',
650
+ 'escalator',
651
+ 'exhibition booth',
652
+ 'extinguisher',
653
+ 'eyeglass',
654
+ 'fan',
655
+ 'faucet',
656
+ 'fax machine',
657
+ 'fence',
658
+ 'ferris wheel',
659
+ 'fire extinguisher',
660
+ 'fire hydrant',
661
+ 'fire place',
662
+ 'fish',
663
+ 'fish tank',
664
+ 'fishbowl',
665
+ 'fishing net',
666
+ 'fishing pole',
667
+ 'flag',
668
+ 'flagstaff',
669
+ 'flame',
670
+ 'flashlight',
671
+ 'flower',
672
+ 'fly',
673
+ 'food',
674
+ 'footbridge',
675
+ 'forceps',
676
+ 'fork',
677
+ 'forklift',
678
+ 'fountain',
679
+ 'fox',
680
+ 'frame',
681
+ 'fridge',
682
+ 'frog',
683
+ 'fruit',
684
+ 'funnel',
685
+ 'furnace',
686
+ 'game controller',
687
+ 'game machine',
688
+ 'gas cylinder',
689
+ 'gas hood',
690
+ 'gas stove',
691
+ 'gift box',
692
+ 'glass',
693
+ 'glass marble',
694
+ 'globe',
695
+ 'glove',
696
+ 'goal',
697
+ 'grandstand',
698
+ 'gravestone',
699
+ 'guardrail',
700
+ 'guitar',
701
+ 'gun',
702
+ 'hammer',
703
+ 'hand cart',
704
+ 'handle',
705
+ 'handrail',
706
+ 'hanger',
707
+ 'hard disk drive',
708
+ 'hat',
709
+ 'hay',
710
+ 'headphone',
711
+ 'heater',
712
+ 'helicopter',
713
+ 'helmet',
714
+ 'holder',
715
+ 'hook',
716
+ 'horse',
717
+ 'horse-drawn carriage',
718
+ 'hot-air balloon',
719
+ 'hydrovalve',
720
+ 'inflator pump',
721
+ 'ipod',
722
+ 'iron',
723
+ 'ironing board',
724
+ 'jar',
725
+ 'kart',
726
+ 'kettle',
727
+ 'key',
728
+ 'keyboard',
729
+ 'kitchen range',
730
+ 'kite',
731
+ 'knife',
732
+ 'knife block',
733
+ 'ladder',
734
+ 'ladder truck',
735
+ 'ladle',
736
+ 'laptop',
737
+ 'lid',
738
+ 'life buoy',
739
+ 'light',
740
+ 'light bulb',
741
+ 'lighter',
742
+ 'line',
743
+ 'lion',
744
+ 'lobster',
745
+ 'lock',
746
+ 'machine',
747
+ 'mailbox',
748
+ 'mannequin',
749
+ 'map',
750
+ 'mask',
751
+ 'mat',
752
+ 'match book',
753
+ 'mattress',
754
+ 'menu',
755
+ 'metal',
756
+ 'meter box',
757
+ 'microphone',
758
+ 'microwave',
759
+ 'mirror',
760
+ 'missile',
761
+ 'model',
762
+ 'money',
763
+ 'monkey',
764
+ 'mop',
765
+ 'motorbike',
766
+ 'mouse',
767
+ 'mouse pad',
768
+ 'musical instrument',
769
+ 'napkin',
770
+ 'net',
771
+ 'newspaper',
772
+ 'oar',
773
+ 'ornament',
774
+ 'outlet',
775
+ 'oven',
776
+ 'oxygen bottle',
777
+ 'pack',
778
+ 'pan',
779
+ 'paper',
780
+ 'paper box',
781
+ 'paper cutter',
782
+ 'parachute',
783
+ 'parasol',
784
+ 'pelage',
785
+ 'pen',
786
+ 'pen container',
787
+ 'pencil',
788
+ 'person',
789
+ 'photo',
790
+ 'piano',
791
+ 'picture',
792
+ 'pig',
793
+ 'pillar',
794
+ 'pillow',
795
+ 'pipe',
796
+ 'pitcher',
797
+ 'plant',
798
+ 'plastic',
799
+ 'plate',
800
+ 'platform',
801
+ 'player',
802
+ 'playground',
803
+ 'pliers',
804
+ 'plume',
805
+ 'poker',
806
+ 'poker chip',
807
+ 'pole',
808
+ 'pool table',
809
+ 'postcard',
810
+ 'poster',
811
+ 'pot',
812
+ 'pottedplant',
813
+ 'printer',
814
+ 'projector',
815
+ 'pumpkin',
816
+ 'rabbit',
817
+ 'racket',
818
+ 'radiator',
819
+ 'radio',
820
+ 'rail',
821
+ 'rake',
822
+ 'ramp',
823
+ 'range hood',
824
+ 'receiver',
825
+ 'recorder',
826
+ 'recreational machines',
827
+ 'remote control',
828
+ 'robot',
829
+ 'rocket',
830
+ 'rocking horse',
831
+ 'rope',
832
+ 'ruler',
833
+ 'runway',
834
+ 'saddle',
835
+ 'saw',
836
+ 'scale',
837
+ 'scanner',
838
+ 'scissors',
839
+ 'scoop',
840
+ 'screen',
841
+ 'screwdriver',
842
+ 'sculpture',
843
+ 'scythe',
844
+ 'sewer',
845
+ 'sewing machine',
846
+ 'shed',
847
+ 'sheep',
848
+ 'shell',
849
+ 'shelves',
850
+ 'shoe',
851
+ 'shopping cart',
852
+ 'shovel',
853
+ 'sidecar',
854
+ 'sidewalk',
855
+ 'sign',
856
+ 'signal light',
857
+ 'sink',
858
+ 'skateboard',
859
+ 'ski',
860
+ 'sled',
861
+ 'slippers',
862
+ 'smoke',
863
+ 'snail',
864
+ 'snake',
865
+ 'snowmobiles',
866
+ 'sofa',
867
+ 'spanner',
868
+ 'spatula',
869
+ 'speaker',
870
+ 'speed bump',
871
+ 'spice container',
872
+ 'spoon',
873
+ 'sprayer',
874
+ 'squirrel',
875
+ 'stage',
876
+ 'stair',
877
+ 'stapler',
878
+ 'stick',
879
+ 'sticky note',
880
+ 'stool',
881
+ 'stove',
882
+ 'straw',
883
+ 'stretcher',
884
+ 'sunglass',
885
+ 'sunshade',
886
+ 'surveillance camera',
887
+ 'swan',
888
+ 'sweeper',
889
+ 'swim ring',
890
+ 'swimming pool',
891
+ 'swing',
892
+ 'switch',
893
+ 'table',
894
+ 'tableware',
895
+ 'tank',
896
+ 'tap',
897
+ 'tape',
898
+ 'tarp',
899
+ 'telephone',
900
+ 'telephone booth',
901
+ 'tent',
902
+ 'tire',
903
+ 'toaster',
904
+ 'toilet',
905
+ 'tong',
906
+ 'tool',
907
+ 'toothbrush',
908
+ 'towel',
909
+ 'toy',
910
+ 'toy car',
911
+ 'track',
912
+ 'train',
913
+ 'trampoline',
914
+ 'trash bin',
915
+ 'tray',
916
+ 'tree',
917
+ 'tricycle',
918
+ 'tripod',
919
+ 'trophy',
920
+ 'truck',
921
+ 'tube',
922
+ 'turtle',
923
+ 'tvmonitor',
924
+ 'tweezers',
925
+ 'typewriter',
926
+ 'umbrella',
927
+ 'unknown',
928
+ 'vacuum cleaner',
929
+ 'vending machine',
930
+ 'video camera',
931
+ 'video game console',
932
+ 'video player',
933
+ 'video tape',
934
+ 'violin',
935
+ 'wakeboard',
936
+ 'wallet',
937
+ 'wardrobe',
938
+ 'washing machine',
939
+ 'watch',
940
+ 'water dispenser',
941
+ 'water pipe',
942
+ 'water skate board',
943
+ 'watermelon',
944
+ 'whale',
945
+ 'wharf',
946
+ 'wheel',
947
+ 'wheelchair',
948
+ 'window',
949
+ 'window blinds',
950
+ 'wineglass',
951
+ 'wire',
952
+ 'wool',
953
+ ]
954
+
955
+ PASCAL_459_STUFF_CLASS_ID = [
956
+ 6, 67, 85, 92, 96, 111, 157, 160, 186, 188, 210, 228, 258, 277, 278, 323,
957
+ 325, 329, 333, 359, 365, 381, 386, 439, 444, 457,
958
+ ]
959
+
960
+ PASCAL_459_THING_CLASS_ID = [
961
+ i for i in range(459) if i not in PASCAL_459_STUFF_CLASS_ID
962
+ ]
963
+
964
+
965
+ class Pascal459Dataset(Dataset):
966
+ """PASCAL 459 dataset."""
967
+
968
+ def __init__(self, root, split='validation', transform=None):
969
+ super(Pascal459Dataset, self).__init__()
970
+ self.root = root
971
+ self.split = split
972
+ self.transforms = transform
973
+ self.image_dir = os.path.join(root, 'images', split)
974
+ self.mask_dir = os.path.join(root, 'annotations_ctx459', split)
975
+ self.images = os.listdir(self.image_dir)
976
+
977
+ def __getitem__(self, index):
978
+ image_path = os.path.join(self.image_dir, self.images[index])
979
+ image = Image.open(image_path).convert('RGB')
980
+ target = (
981
+ np.asarray(
982
+ Image.open(
983
+ os.path.join(
984
+ self.mask_dir, self.images[index].replace('jpg', 'tif')
985
+ )
986
+ ),
987
+ dtype=np.int32,
988
+ )
989
+ + 1
990
+ )
991
+
992
+ if self.transforms:
993
+ image = self.transforms(image)
994
+
995
+ return image, image_path, target, index
996
+
997
+ def __len__(self):
998
+ return len(self.images)
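
A short, hedged usage sketch of `Pascal459Dataset` follows; the data root is an assumption, and it relies on the `images/<split>` and `annotations_ctx459/<split>` layout with `.tif` label maps shifted by +1, as implemented above.

```python
# Hedged usage sketch of Pascal459Dataset; the root path is an assumption.
from torchvision import transforms
from data.pascal459 import PASCAL_459_CLASSES, Pascal459Dataset

transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
])

dataset = Pascal459Dataset(
    root='/datasets/pascal459', split='validation', transform=transform
)
image, image_path, target, index = dataset[0]
# Labels are shifted by +1, so id 1 corresponds to PASCAL_459_CLASSES[0].
print(image.shape, target.shape, PASCAL_459_CLASSES[int(target.max()) - 1])
```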
data/preprocess.py ADDED
@@ -0,0 +1,110 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Preprocess for referring datasets.
17
+
18
+ Adapted from
19
+ https://github.com/yz93/LAVT-RIS/blob/main/data/dataset_refer_bert.py
20
+ """
21
+ # pylint: disable=all
22
+ from refer.refer import REFER
23
+ from torch.utils import data
24
+
25
+
26
+ class ReferDataset(data.Dataset):
27
+ """Refer dataset."""
28
+
29
+ def __init__(
30
+ self,
31
+ root,
32
+ dataset='refcoco',
33
+ splitBy='unc',
34
+ image_transforms=None,
35
+ target_transforms=None,
36
+ split='train',
37
+ eval_mode=False,
38
+ ):
39
+
40
+ self.classes = []
41
+ self.image_transforms = image_transforms
42
+ self.target_transforms = target_transforms
43
+ self.split = split
44
+ self.refer = REFER(root, dataset=dataset, splitBy=splitBy)
45
+
46
+ ref_ids = self.refer.getRefIds(split=self.split)
47
+ img_ids = self.refer.getImgIds(ref_ids)
48
+
49
+ all_imgs = self.refer.Imgs
50
+ self.imgs = list(all_imgs[i] for i in img_ids)
51
+ self.ref_ids = ref_ids
52
+ print(len(ref_ids))
53
+ print(len(self.imgs))
54
+ # print(self.imgs)
55
+ self.sentence_raw = []
56
+
57
+ self.eval_mode = eval_mode
58
+ # if we are testing on a dataset, test all sentences of an object;
59
+ # o/w, we are validating during training, randomly sample one sentence for
60
+ # efficiency
61
+ for r in ref_ids:
62
+ ref = self.refer.Refs[r]
63
+ ref_sentences = []
64
+ for el, _ in zip(ref['sentences'], ref['sent_ids']):
65
+ sentence_raw = el['raw']
66
+ ref_sentences.append(sentence_raw)
67
+
68
+ self.sentence_raw.append(ref_sentences)
69
+ # print(len(self.sentence_raw))
70
+
71
+ def get_classes(self):
72
+ return self.classes
73
+
74
+ def __len__(self):
75
+ return len(self.imgs)
76
+
77
+ def __getitem__(self, index):
78
+ this_img_id = self.imgs[index]['id']
79
+ this_ref_ids = self.refer.getRefIds(this_img_id)
80
+ this_img = self.refer.Imgs[this_img_id]
81
+ refs = [self.refer.loadRefs(this_ref_id) for this_ref_id in this_ref_ids]
82
+
83
+ batch_sentences = {}
84
+ # batch_targets = {}
85
+ for ref in refs:
86
+ # Get sentence
87
+ sentence_lis = []
88
+ for el, _ in zip(ref[0]['sentences'], ref[0]['sent_ids']):
89
+ sentence_raw = el['raw']
90
+ sentence_lis.append(sentence_raw)
91
+ batch_sentences.update({ref[0]['ref_id']: sentence_lis})
92
+
93
+ return [this_img['file_name']], batch_sentences
94
+
95
+ def get_ref(self):
96
+ name_lis = []
97
+ for i in range(len(self.ref_ids)):
98
+ rid = self.ref_ids[i]
99
+ # print(rid)
100
+ ref = self.refer.loadRefs(rid)
101
+ if ref[0]['file_name'] == '':
102
+ print(1)
103
+ # print(ref[0]['file_name'])
104
+ # if ref[0]['file_name'] in name_lis:
105
+ # print("md")
106
+ name_lis.append(ref[0]['file_name'])
107
+ print(ref[0]['file_name'])
108
+ # print(name_lis)
109
+ print(len(name_lis))
110
+ print(len(list(set(name_lis))))
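
A hedged sketch of how this preprocessing `ReferDataset` is meant to be consumed (the root path is an assumption): each item pairs one image file name with a dictionary mapping every `ref_id` in that image to its list of raw referring sentences.

```python
# Hedged usage sketch of the preprocessing ReferDataset; the root is an assumption.
from data.preprocess import ReferDataset

dataset = ReferDataset(
    root='/datasets/refseg', dataset='refcoco', splitBy='unc', split='val'
)
file_names, sentences_by_ref = dataset[0]
print(file_names[0])
for ref_id, sentences in sentences_by_ref.items():
    print(ref_id, sentences)
```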
data/refcoco.py ADDED
@@ -0,0 +1,449 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """RefCOCO dataset."""
17
+
18
+ # Adapted from
19
+ # https://github.com/yz93/LAVT-RIS/blob/main/data/dataset_refer_bert.py
20
+ # pylint: disable=all
21
+ import itertools
22
+ import json
23
+ import os
24
+ import os.path as osp
25
+ import pickle as pickle
26
+ import sys
27
+ import time
28
+ # pylint: disable=g-importing-member
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Polygon
31
+ from matplotlib.patches import Rectangle
32
+ import matplotlib.pyplot as plt
33
+ import numpy as np
34
+ from PIL import Image
35
+ from pycocotools import mask
36
+ import skimage.io as io
37
+ import torch
38
+ import torch.utils.data as data
39
+ from torchvision import transforms
40
+
41
+
42
+ class REFER:
43
+ """RefCOCO dataset."""
44
+
45
+ def __init__(self, data_root, dataset='refcoco', splitBy='unc', split='val'):
46
+ # provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
47
+ # also provide dataset name and splitBy information
48
+ # e.g., dataset = 'refcoco', splitBy = 'unc'
49
+ print('loading dataset %s into memory...' % dataset)
50
+ if dataset == 'refcocog':
51
+ print('Split by {}!'.format(splitBy))
52
+ self.ROOT_DIR = osp.abspath(osp.dirname(__file__))
53
+ self.DATA_DIR = osp.join(data_root, dataset)
54
+ if dataset in ['refcoco', 'refcoco+', 'refcocog']:
55
+ self.IMAGE_DIR = osp.join(data_root, 'images/mscoco/images/train2014')
56
+ elif dataset == 'refclef':
57
+ self.IMAGE_DIR = osp.join(data_root, 'images/saiapr_tc-12')
58
+ else:
59
+ print('No refer dataset is called [%s]' % dataset)
60
+ sys.exit()
61
+
62
+ # load refs from data/dataset/refs(dataset).json
63
+ tic = time.time()
64
+ ref_file = osp.join(self.DATA_DIR, 'refs(' + splitBy + ').p')
65
+ self.data = {}
66
+ self.data['dataset'] = dataset
67
+ # f = open(ref_file, 'r')
68
+ self.data['refs'] = pickle.load(open(ref_file, 'rb'))
69
+
70
+ # load annotations from data/dataset/instances.json
71
+ instances_file = osp.join(self.DATA_DIR, 'instances.json')
72
+ instances = json.load(open(instances_file, 'r'))
73
+ self.data['images'] = instances['images']
74
+ self.data['annotations'] = instances['annotations']
75
+ self.data['categories'] = instances['categories']
76
+
77
+ # create index
78
+ self.createIndex()
79
+ self.split = split
80
+ print('DONE (t=%.2fs)' % (time.time() - tic))
81
+
82
+ def createIndex(self):
83
+ # create sets of mapping
84
+ # 1) Refs: {ref_id: ref}
85
+ # 2) Anns: {ann_id: ann}
86
+ # 3) Imgs: {image_id: image}
87
+ # 4) Cats: {category_id: category_name}
88
+ # 5) Sents: {sent_id: sent}
89
+ # 6) imgToRefs: {image_id: refs}
90
+ # 7) imgToAnns: {image_id: anns}
91
+ # 8) refToAnn: {ref_id: ann}
92
+ # 9) annToRef: {ann_id: ref}
93
+ # 10) catToRefs: {category_id: refs}
94
+ # 11) sentToRef: {sent_id: ref}
95
+ # 12) sentToTokens: {sent_id: tokens}
96
+ print('creating index...')
97
+ # fetch info from instances
98
+ Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
99
+ for ann in self.data['annotations']:
100
+ Anns[ann['id']] = ann
101
+ imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann]
102
+ for img in self.data['images']:
103
+ Imgs[img['id']] = img
104
+ for cat in self.data['categories']:
105
+ Cats[cat['id']] = cat['name']
106
+
107
+ # fetch info from refs
108
+ Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
109
+ Sents, sentToRef, sentToTokens = {}, {}, {}
110
+ for ref in self.data['refs']:
111
+ # ids
112
+ ref_id = ref['ref_id']
113
+ ann_id = ref['ann_id']
114
+ category_id = ref['category_id']
115
+ image_id = ref['image_id']
116
+
117
+ # add mapping related to ref
118
+ Refs[ref_id] = ref
119
+ imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
120
+ catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
121
+ refToAnn[ref_id] = Anns[ann_id]
122
+ annToRef[ann_id] = ref
123
+
124
+ # add mapping of sent
125
+ for sent in ref['sentences']:
126
+ Sents[sent['sent_id']] = sent
127
+ sentToRef[sent['sent_id']] = ref
128
+ sentToTokens[sent['sent_id']] = sent['tokens']
129
+
130
+ # create class members
131
+ self.Refs = Refs
132
+ self.Anns = Anns
133
+ self.Imgs = Imgs
134
+ self.Cats = Cats
135
+ self.Sents = Sents
136
+ self.imgToRefs = imgToRefs
137
+ self.imgToAnns = imgToAnns
138
+ self.refToAnn = refToAnn
139
+ self.annToRef = annToRef
140
+ self.catToRefs = catToRefs
141
+ self.sentToRef = sentToRef
142
+ self.sentToTokens = sentToTokens
143
+ print('index created.')
144
+
145
+ def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
146
+ image_ids = image_ids if type(image_ids) == list else [image_ids]
147
+ cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
148
+ ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
149
+
150
+ if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
151
+ refs = self.data['refs']
152
+ else:
153
+ if not len(image_ids) == 0:
154
+ refs = [self.imgToRefs[image_id] for image_id in image_ids]
155
+ ref_ids = []
156
+ for img_ref in refs:
157
+ ref_ids.extend([ref['ref_id'] for ref in img_ref])
158
+ return ref_ids
159
+ else:
160
+ refs = self.data['refs']
161
+ if not len(cat_ids) == 0:
162
+ refs = [ref for ref in refs if ref['category_id'] in cat_ids]
163
+ if not len(ref_ids) == 0:
164
+ refs = [ref for ref in refs if ref['ref_id'] in ref_ids]
165
+ if not len(split) == 0:
166
+ if split in ['testA', 'testB', 'testC']:
167
+ # we also consider testAB, testBC, ...
168
+ refs = [ref for ref in refs if split[-1] in ref['split']]
169
+ elif split in ['testAB', 'testBC', 'testAC']:
170
+ # rarely used I guess...
171
+ refs = [ref for ref in refs if ref['split'] == split]
172
+ elif split == 'test':
173
+ refs = [ref for ref in refs if 'test' in ref['split']]
174
+ elif split == 'train' or split == 'val':
175
+ refs = [ref for ref in refs if ref['split'] == split]
176
+ else:
177
+ print('No such split [%s]' % split)
178
+ sys.exit()
179
+ ref_ids = [ref['ref_id'] for ref in refs]
180
+ return ref_ids
181
+
182
+ def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
183
+ image_ids = image_ids if type(image_ids) == list else [image_ids]
184
+ cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
185
+ ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
186
+
187
+ if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
188
+ ann_ids = [ann['id'] for ann in self.data['annotations']]
189
+ else:
190
+ if not len(image_ids) == 0:
191
+ lists = [
192
+ self.imgToAnns[image_id]
193
+ for image_id in image_ids
194
+ if image_id in self.imgToAnns
195
+ ] # list of [anns]
196
+ anns = list(itertools.chain.from_iterable(lists))
197
+ else:
198
+ anns = self.data['annotations']
199
+ if not len(cat_ids) == 0:
200
+ anns = [ann for ann in anns if ann['category_id'] in cat_ids]
201
+ ann_ids = [ann['id'] for ann in anns]
202
+ # if not len(ref_ids) == 0:
203
+ # ids = set(ann_ids).intersection(
204
+ # set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids])
205
+ # )
206
+ return ann_ids
207
+
208
+ def getImgIds(self, ref_ids=[]):
209
+ ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
210
+
211
+ if not len(ref_ids) == 0:
212
+ image_ids = list(
213
+ set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids])
214
+ )
215
+ else:
216
+ image_ids = self.Imgs.keys()
217
+ return image_ids
218
+
219
+ def getCatIds(self):
220
+ return self.Cats.keys()
221
+
222
+ def loadRefs(self, ref_ids=[]):
223
+ if type(ref_ids) == list:
224
+ return [self.Refs[ref_id] for ref_id in ref_ids]
225
+ elif type(ref_ids) == int:
226
+ return [self.Refs[ref_ids]]
227
+
228
+ def loadAnns(self, ann_ids=[]):
229
+ if type(ann_ids) == list:
230
+ return [self.Anns[ann_id] for ann_id in ann_ids]
231
+ elif type(ann_ids) == int or type(ann_ids) == str:
232
+ return [self.Anns[ann_ids]]
233
+
234
+ def loadImgs(self, image_ids=[]):
235
+ if type(image_ids) == list:
236
+ return [self.Imgs[image_id] for image_id in image_ids]
237
+ elif type(image_ids) == int:
238
+ return [self.Imgs[image_ids]]
239
+
240
+ def loadCats(self, cat_ids=[]):
241
+ if type(cat_ids) == list:
242
+ return [self.Cats[cat_id] for cat_id in cat_ids]
243
+ elif type(cat_ids) == int:
244
+ return [self.Cats[cat_ids]]
245
+
246
+ def getRefBox(self, ref_id):
247
+ # ref = self.Refs[ref_id]
248
+ ann = self.refToAnn[ref_id]
249
+ return ann['bbox'] # [x, y, w, h]
250
+
251
+ def showRef(self, ref, seg_box='seg'):
252
+ ax = plt.gca()
253
+ # show image
254
+ image = self.Imgs[ref['image_id']]
255
+ I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
256
+ ax.imshow(I)
257
+ # show refer expression
258
+ for sid, sent in enumerate(ref['sentences']):
259
+ print('%s. %s' % (sid + 1, sent['sent']))
260
+ # show segmentations
261
+ if seg_box == 'seg':
262
+ ann_id = ref['ann_id']
263
+ ann = self.Anns[ann_id]
264
+ polygons = []
265
+ color = []
266
+ c = 'none'
267
+ if type(ann['segmentation'][0]) == list:
268
+ # polygon used for refcoco*
269
+ for seg in ann['segmentation']:
270
+ poly = np.array(seg).reshape((len(seg) // 2, 2))
271
+ polygons.append(Polygon(poly, True, alpha=0.4))
272
+ color.append(c)
273
+ p = PatchCollection(
274
+ polygons,
275
+ facecolors=color,
276
+ edgecolors=(1, 1, 0, 0),
277
+ linewidths=3,
278
+ alpha=1,
279
+ )
280
+ ax.add_collection(p) # thick yellow polygon
281
+ p = PatchCollection(
282
+ polygons,
283
+ facecolors=color,
284
+ edgecolors=(1, 0, 0, 0),
285
+ linewidths=1,
286
+ alpha=1,
287
+ )
288
+ ax.add_collection(p) # thin red polygon
289
+ else:
290
+ # mask used for refclef
291
+ rle = ann['segmentation']
292
+ m = mask.decode(rle)
293
+ img = np.ones((m.shape[0], m.shape[1], 3))
294
+ color_mask = np.array([2.0, 166.0, 101.0]) / 255
295
+ for i in range(3):
296
+ img[:, :, i] = color_mask[i]
297
+ ax.imshow(np.dstack((img, m * 0.5)))
298
+ # show bounding-box
299
+ elif seg_box == 'box':
300
+ # ann_id = ref['ann_id']
301
+ # ann = self.Anns[ann_id]
302
+ bbox = self.getRefBox(ref['ref_id'])
303
+ box_plot = Rectangle(
304
+ (bbox[0], bbox[1]),
305
+ bbox[2],
306
+ bbox[3],
307
+ fill=False,
308
+ edgecolor='green',
309
+ linewidth=3,
310
+ )
311
+ ax.add_patch(box_plot)
312
+
313
+ def getMask(self, ref):
314
+ # return mask, area and mask-center
315
+ ann = self.refToAnn[ref['ref_id']]
316
+ image = self.Imgs[ref['image_id']]
317
+
318
+ if type(ann['segmentation'][0]) == list: # polygon
319
+ rle = mask.frPyObjects(
320
+ ann['segmentation'], image['height'], image['width']
321
+ )
322
+ else:
323
+ rle = ann['segmentation']
324
+
325
+ m = mask.decode(rle)
326
+ # sometimes there are multiple binary maps (corresponding to multiple segments)
327
+ m = np.sum(m, axis=2)
328
+ m = m.astype(np.uint8) # convert to np.uint8
329
+ # compute area
330
+ area = sum(mask.area(rle)) # should be close to ann['area']
331
+ return {'mask': m, 'area': area}
332
+
333
+ def showMask(self, ref):
334
+ M = self.getMask(ref)
335
+ msk = M['mask']
336
+ ax = plt.gca()
337
+ ax.imshow(msk)
338
+
339
+
340
+ class ReferDataset(data.Dataset):
341
+
342
+ def __init__(
343
+ self,
344
+ root,
345
+ dataset='refcoco',
346
+ splitBy='google',
347
+ image_transforms=None,
348
+ target_transforms=None,
349
+ split='train',
350
+ eval_mode=False,
351
+ ):
352
+
353
+ self.classes = []
354
+ self.image_transforms = image_transforms
355
+ self.target_transforms = target_transforms
356
+ self.split = split
357
+ self.refer = REFER(root, dataset=dataset, splitBy=splitBy)
358
+
359
+ ref_ids = self.refer.getRefIds(split=self.split)
360
+ img_ids = self.refer.getImgIds(ref_ids)
361
+
362
+ all_imgs = self.refer.Imgs
363
+ self.imgs = list(all_imgs[i] for i in img_ids)
364
+ self.ref_ids = ref_ids
365
+ # print(len(ref_ids))
366
+ # print(len(self.imgs))
367
+ self.sentence_raw = []
368
+
369
+ self.eval_mode = eval_mode
370
+ # if we are testing on a dataset, test all sentences of an object;
371
+ # o/w, we are validating during training, randomly sample one sentence
372
+ # for efficiency
373
+ for r in ref_ids:
374
+ ref = self.refer.Refs[r]
375
+ ref_sentences = []
376
+ # for i, (el, sent_id) in enumerate(zip(ref['sentences'],
377
+ # ref['sent_ids'])):
378
+ for el in ref['sentences']:
379
+ sentence_raw = el['raw']
380
+ ref_sentences.append(sentence_raw)
381
+ self.sentence_raw.append(ref_sentences)
382
+ # print(len(self.sentence_raw))
383
+
384
+ def get_classes(self):
385
+ return self.classes
386
+
387
+ def __len__(self):
388
+ return len(self.ref_ids)
389
+
390
+ def __getitem__(self, index):
391
+ this_ref_id = self.ref_ids[index]
392
+ this_img_id = self.refer.getImgIds(this_ref_id)
393
+ this_img = self.refer.Imgs[this_img_id[0]]
394
+ # print(this_ref_id, this_img_id)
395
+ # print(len(self.ref_ids))
396
+ img_path = os.path.join(self.refer.IMAGE_DIR, this_img['file_name'])
397
+ img = Image.open(img_path).convert('RGB')
398
+ ref = self.refer.loadRefs(this_ref_id)
399
+ # print("ref",ref)
400
+
401
+ ref_mask = np.array(self.refer.getMask(ref[0])['mask'])
402
+ annot = np.zeros(ref_mask.shape)
403
+ annot[ref_mask == 1] = 1
404
+
405
+ target = Image.fromarray(annot.astype(np.uint8), mode='P')
406
+ # print(np.array(target), np.unique(np.array(target).flatten()))
407
+ if self.image_transforms is not None:
408
+ # resize, from PIL to tensor, and mean and std normalization
409
+ img = self.image_transforms(img)
410
+ # target = self.target_transforms(target)
411
+ target = torch.as_tensor(np.array(target, copy=True))
412
+ # target = target.permute((2, 0, 1))
413
+ sentence = self.sentence_raw[index]
414
+
415
+ return img, img_path, target, sentence
416
+
417
+
418
+ if __name__ == '__main__':
419
+
420
+ def get_transform():
421
+ transform = [
422
+ transforms.Resize((224, 224)),
423
+ transforms.ToTensor(),
424
+ # T.Normalize(mean=[0.485, 0.456, 0.406],
425
+ # std=[0.229, 0.224, 0.225])
426
+ ]
427
+
428
+ return transforms.Compose(transform)
429
+
430
+ transform = get_transform()
431
+ dataset_test = ReferDataset(
432
+ root='/datasets/refseg',
433
+ dataset='refcoco+',
434
+ splitBy='google',
435
+ image_transforms=transform,
436
+ target_transforms=transform,
437
+ split='train',
438
+ eval_mode=False,
439
+ )
440
+ print('loaded')
441
+ test_sampler = torch.utils.data.SequentialSampler(dataset_test)
442
+ data_loader_test = torch.utils.data.DataLoader(
443
+ dataset_test, batch_size=1, sampler=test_sampler, num_workers=1
444
+ )
445
+
446
+ for img, img_path, target, sentence in data_loader_test:
447
+ # print(type(img),type(target))
448
+ print(sentence)
449
+ break
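
The `__main__` block above exercises the `ReferDataset` wrapper; for completeness, here is a hedged sketch of the lower-level `REFER` index it builds (the data root is again an assumption).

```python
# Hedged sketch of the REFER index API; '/datasets/refseg' is an assumed root.
from data.refcoco import REFER

refer = REFER('/datasets/refseg', dataset='refcoco', splitBy='unc', split='val')
ref_ids = refer.getRefIds(split='val')       # all referring expressions in the split
ref = refer.loadRefs(ref_ids[0])[0]          # one annotation record
print([s['raw'] for s in ref['sentences']])  # its raw referring sentences
mask_info = refer.getMask(ref)               # binary mask and pixel area
print(mask_info['mask'].shape, mask_info['area'])
```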
data/voc.py ADDED
@@ -0,0 +1,148 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Pascal VOC dataset."""
17
+
18
+ import numpy as np
19
+ from PIL import Image
20
+ # pylint: disable=g-importing-member
21
+ from torchvision.datasets import VOCSegmentation
22
+
23
+ CLASS2ID = {
24
+ 'Background': 0,
25
+ 'Aero plane': 1,
26
+ 'Bicycle': 2,
27
+ 'Bird': 3,
28
+ 'Boat': 4,
29
+ 'Bottle': 5,
30
+ 'Bus': 6,
31
+ 'Car': 7,
32
+ 'Cat': 8,
33
+ 'Chair': 9,
34
+ 'Cow': 10,
35
+ 'Dining table': 11,
36
+ 'Dog': 12,
37
+ 'Horse': 13,
38
+ 'Motorbike': 14,
39
+ 'Person': 15,
40
+ 'Potted plant': 16,
41
+ 'Sheep': 17,
42
+ 'Sofa': 18,
43
+ 'Train': 19,
44
+ 'Tv/Monitor': 20,
45
+ # ... add more entries as needed
46
+ 'Border': 255,
47
+ }
48
+
49
+
50
+ VOC_CLASSES = [
51
+ 'aeroplane',
52
+ 'bicycle',
53
+ 'bird avian',
54
+ 'boat',
55
+ 'bottle',
56
+ 'bus',
57
+ 'car',
58
+ 'cat',
59
+ 'chair seat',
60
+ 'cow',
61
+ 'diningtable',
62
+ 'dog',
63
+ 'horse',
64
+ 'motorbike',
65
+ 'person with clothes,people,human',
66
+ 'pottedplant',
67
+ 'sheep',
68
+ 'sofa',
69
+ 'train',
70
+ 'tvmonitor screen',
71
+ ]
72
+
73
+
74
+ BACKGROUND_CATEGORY = [
75
+ 'ground',
76
+ 'land',
77
+ 'grass',
78
+ 'tree',
79
+ 'building',
80
+ 'wall',
81
+ 'sky',
82
+ 'lake',
83
+ 'water',
84
+ 'river',
85
+ 'sea',
86
+ 'keyboard',
87
+ 'helmet',
88
+ 'cloud',
89
+ 'house',
90
+ 'mountain',
91
+ 'ocean',
92
+ 'road',
93
+ 'rock',
94
+ 'street',
95
+ 'valley',
96
+ 'bridge',
97
+ 'sign',
98
+ ]
99
+
100
+
101
+ class VOCDataset(VOCSegmentation):
102
+ """Pascal VOC dataset."""
103
+
104
+ def __init__(
105
+ self,
106
+ root='/datasets/jianhaoy/PASCAL/',
107
+ year='2012',
108
+ split='val',
109
+ target_transform=None,
110
+ download=False,
111
+ transform=None,
112
+ ):
113
+ super(VOCDataset, self).__init__(
114
+ root=root,
115
+ image_set=split,
116
+ year=year,
117
+ target_transform=transform,
118
+ download=download,
119
+ transform=transform,
120
+ )
121
+ self.idx_to_class = {val: key for (key, val) in CLASS2ID.items()}
122
+
123
+ def __getitem__(self, index):
124
+ image_path = self.images[index]
125
+ image = Image.open(image_path).convert('RGB')
126
+ target = np.asarray(Image.open(self.masks[index]), dtype=np.int32)
127
+
128
+ _, unique_values = self.process_target(np.array(target))
129
+ classnames = [self.idx_to_class[idx] for idx in unique_values]
130
+
131
+ if self.transforms:
132
+ image = self.transform(image)
133
+
134
+ return image, str(image_path), target, classnames
135
+
136
+ def process_target(self, arr):
137
+ # Map background (0) and border (255) pixels to 0
138
+ arr[(arr == 0) | (arr == 255)] = 0
139
+
140
+ # Find unique values (excluding 0 and 255)
141
+ unique_values = np.unique(arr[(arr != 0) & (arr != 255)])
142
+
143
+ # Create separate masks for each unique value
144
+ masks = [arr == value for value in unique_values]
145
+ masks = [Image.fromarray(arr) for arr in masks]
146
+ masks = [self.target_transform(arr) for arr in masks]
147
+
148
+ return masks, unique_values
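
To make the label handling above concrete, here is a small hedged illustration of how `process_target` and the `idx_to_class` mapping recover the class names present in an annotation; the 2x2 array is dummy data, not a real VOC mask.

```python
# Toy illustration of the VOC label convention used above (dummy data).
import numpy as np
from data.voc import CLASS2ID

idx_to_class = {val: key for (key, val) in CLASS2ID.items()}
# Background (0), a cat (8), a dog (12), and border pixels (255).
arr = np.array([[0, 8], [12, 255]])
arr[(arr == 0) | (arr == 255)] = 0                   # drop background and border
unique_values = np.unique(arr[(arr != 0) & (arr != 255)])
print([idx_to_class[idx] for idx in unique_values])  # ['Cat', 'Dog']
```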
demo.py ADDED
@@ -0,0 +1,227 @@
1
+ """Run a demo of the CaR model on a single image."""
2
+
3
+ import numpy as np
4
+ import os
5
+ import argparse
6
+ from functools import reduce
7
+ import PIL.Image as Image
8
+ import torch
9
+ from modeling.model import CaR
10
+ from utils.utils import Config, load_yaml
11
+ import matplotlib.pyplot as plt
12
+ import colorsys
13
+ from modeling.post_process.post_process import (
14
+ match_masks,
15
+ generate_masks_from_sam,
16
+ )
17
+ from sam.sam import SAMPipeline
18
+ from sam.utils import build_sam_config
19
+ import random
20
+ import time
21
+
22
+
23
+ def generate_distinct_colors(n):
24
+ colors = []
25
+ # generate a random number from 0 to 1
26
+ random_color_bias = random.random()
27
+
28
+ for i in range(n):
29
+ hue = float(i) / n
30
+ hue += random_color_bias
31
+ hue = hue % 1.0
32
+ rgb = colorsys.hsv_to_rgb(hue, 1.0, 1.0)
33
+ # Convert RGB values from [0, 1] range to [0, 255]
34
+ colors.append(tuple(int(val * 255) for val in rgb))
35
+ return colors
36
+
37
+
38
+ def overlap_masks(masks):
39
+ """
40
+ Overlap masks to generate a single mask for visualization.
41
+
42
+ Parameters:
43
+ - masks: list of torch.Tensors of shape (H, W) representing the binary mask
44
+ for each class.
45
+
46
+ Returns:
47
+ - clean_masks: torch.Tensor of shape (N, H, W) of boolean masks with overlaps removed
48
+ """
49
+ overlap_mask = torch.zeros_like(masks[0])
50
+ for mask_idx, mask in enumerate(masks):
51
+ overlap_mask[mask > 0] = mask_idx + 1
52
+
53
+ clean_masks = [
54
+ overlap_mask == mask_idx + 1 for mask_idx in range(len(masks))
55
+ ]
56
+ clean_masks = torch.stack(clean_masks, dim=0)
57
+
58
+ return clean_masks
59
+
60
+
61
+ def visualize_segmentation(
62
+ image, masks, class_names, alpha=0.45, y_list=None, x_list=None
63
+ ):
64
+ """
65
+ Visualize segmentation masks on an image.
66
+
67
+ Parameters:
68
+ - image: np.array of shape (H, W, 3) representing the RGB image
69
+ - masks: list of np.arrays of shape (H, W) representing binary masks
70
+ for each class.
71
+ - class_names: list of strings representing names of each class
72
+ - alpha: float, transparency level of masks on the image
73
+
74
+ Returns:
75
+ - visualization: plt.figure object
76
+ """
77
+ # Create a figure and axis
78
+ fig, ax = plt.subplots(1, figsize=(12, 9))
79
+ # Display the image
80
+ # ax.imshow(image)
81
+ # Generate distinct colors for each mask
82
+ final_mask = np.zeros(
83
+ (masks.shape[1], masks.shape[2], 3), dtype=np.float32
84
+ )
85
+ colors = generate_distinct_colors(len(class_names))
86
+ idx = 0
87
+ for mask, color, class_name in zip(masks, colors, class_names):
88
+ # Overlay the mask
89
+ final_mask += np.dstack([mask * c for c in color])
90
+ # Find a representative point (e.g., centroid) for placing the label
91
+ if y_list is None or x_list is None:
92
+ y, x = np.argwhere(mask).mean(axis=0)
93
+ else:
94
+ y, x = y_list[idx], x_list[idx]
95
+ ax.text(
96
+ x,
97
+ y,
98
+ class_name,
99
+ color="white",
100
+ fontsize=36,
101
+ va="center",
102
+ ha="center",
103
+ bbox=dict(facecolor="black", alpha=0.7, edgecolor="none"),
104
+ )
105
+
106
+ idx += 1
107
+
108
+ final_image = image * (1 - alpha) + final_mask * alpha
109
+ final_image = final_image.astype(np.uint8)
110
+ ax.imshow(final_image)
111
+ # Remove axis ticks and labels
112
+ ax.axis("off")
113
+ return fig
114
+
115
+
116
+ def get_sam_masks(config, image_path, masks, img_sam=None, pipeline=None):
117
+ print("generating sam masks online")
118
+ mask_tensor, mask_list = generate_masks_from_sam(
119
+ image_path,
120
+ save_path="./",
121
+ pipeline=pipeline,
122
+ img_sam=img_sam,
123
+ visualize=False,
124
+ )
125
+ mask_tensor = mask_tensor.to(masks.device)
126
+ # only conduct sam on masks that is not all zero
127
+ attn_map, mask_ids = [], []
128
+ for mask_id, mask in enumerate(masks):
129
+ if torch.sum(mask) > 0:
130
+ attn_map.append(mask.unsqueeze(0))
131
+ mask_ids.append(mask_id)
132
+ matched_masks = [
133
+ match_masks(
134
+ mask_tensor,
135
+ attn,
136
+ mask_list,
137
+ iom_thres=config.car.iom_thres,
138
+ min_pred_threshold=config.sam.min_pred_threshold,
139
+ )
140
+ for attn in attn_map
141
+ ]
142
+ for matched_mask, mask_id in zip(matched_masks, mask_ids):
143
+ sam_masks = np.array([item["segmentation"] for item in matched_mask])
144
+ sam_mask = np.any(sam_masks, axis=0)
145
+ masks[mask_id] = torch.from_numpy(sam_mask).to(masks.device)
146
+ return masks
147
+
148
+
149
+ def load_sam(config, sam_device):
150
+ sam_checkpoint, model_type = build_sam_config(config)
151
+ pipelines = SAMPipeline(
152
+ sam_checkpoint,
153
+ model_type,
154
+ device=sam_device,
155
+ points_per_side=config.sam.points_per_side,
156
+ pred_iou_thresh=config.sam.pred_iou_thresh,
157
+ stability_score_thresh=config.sam.stability_score_thresh,
158
+ box_nms_thresh=config.sam.box_nms_thresh,
159
+ )
160
+ return pipelines
161
+
162
+
163
+ if __name__ == "__main__":
164
+ parser = argparse.ArgumentParser("CaR")
165
+ # default arguments
166
+
167
+ # additional arguments
168
+ parser.add_argument(
169
+ "--output_path", type=str, default="", help="path to save outputs"
170
+ )
171
+ parser.add_argument(
172
+ "--cfg-path",
173
+ default="configs/voc_test.yaml",
174
+ help="path to configuration file.",
175
+ )
176
+ args = parser.parse_args()
177
+
178
+ cfg = Config(**load_yaml(args.cfg_path))
179
+ device = "cuda" if torch.cuda.is_available() else "cpu"
180
+ # device = 'cpu'
181
+ folder_name = reduce(
182
+ lambda x, y: x.replace(" ", "_") + "_" + y, cfg.image_caption
183
+ )
184
+ if len(folder_name) > 20:
185
+ folder_name = folder_name[:20]
186
+
187
+ car_model = CaR(
188
+ cfg, visualize=True, seg_mode=cfg.test.seg_mode, device=device
189
+ )
190
+
191
+ sam_pipeline = load_sam(cfg, device)
192
+
193
+ img = Image.open(cfg.image_path).convert("RGB")
194
+ # import pdb; pdb.set_trace()
195
+ # downscale the image by a factor of 3 if its width is larger than 1000
196
+ if img.size[0] > 1000:
197
+ img = img.resize((img.size[0] // 3, img.size[1] // 3))
198
+
199
+ label_space = cfg.image_caption
200
+ pseudo_masks, scores, _ = car_model(img, label_space)
201
+
202
+
203
+ if not cfg.test.use_pseudo:
204
+ t1 = time.time()
205
+ pseudo_masks = get_sam_masks(
206
+ cfg,
207
+ cfg.image_path,
208
+ pseudo_masks,
209
+ img_sam=np.array(img),
210
+ pipeline=sam_pipeline,
211
+ )
212
+ pseudo_masks = overlap_masks(pseudo_masks)
213
+ t2 = time.time()
214
+ print(f"sam time: {t2 - t1}")
215
+
216
+ # visualize segmentation masks
217
+ demo_fig = visualize_segmentation(
218
+ np.array(img),
219
+ pseudo_masks.detach().cpu().numpy(),
220
+ label_space,
221
+ )
222
+ save_path = f"vis_results/{folder_name}"
223
+ if not os.path.exists(save_path):
224
+ os.makedirs(save_path)
225
+ demo_fig.savefig(os.path.join(save_path, "demo.png"), bbox_inches="tight")
226
+
227
+ print(f"results saved to {save_path}.")
evaluate.py ADDED
@@ -0,0 +1,511 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Evaluate CaR on segmentation benchmarks."""
17
+ # pylint: disable=g-importing-member
18
+ import argparse
19
+ import numpy as np
20
+ import torch
21
+ from torch.utils import tensorboard
22
+ import torch.utils.data
23
+ from torch.utils.data import Subset
24
+ import torchvision.transforms as T
25
+
26
+ # pylint: disable=g-bad-import-order
27
+ from modeling.model.car import CaR
28
+ from sam.utils import build_sam_config
29
+ from utils.utils import Config
30
+ from utils.utils import load_yaml
31
+ from utils.utils import MetricLogger
32
+ from utils.utils import SmoothedValue
33
+ from utils.inference_pipeline import inference_car
34
+ from utils.merge_mask import merge_masks_simple
35
+
36
+ # Datasets
37
+ # pylint: disable=g-multiple-import
38
+ from data.ade import ADE_THING_CLASS, ADE_STUFF_CLASS, ADE_THING_CLASS_ID, ADE_STUFF_CLASS_ID, ADEDataset
39
+ from data.ade847 import ADE_847_THING_CLASS_ID, ADE_847_STUFF_CLASS_ID, ADE_847_THING_CLASS, ADE_847_STUFF_CLASS, ADE847Dataset
40
+ from data.coco import COCO_OBJECT_CLASSES, COCODataset
41
+ from data.context import PASCAL_CONTEXT_STUFF_CLASS_ID, PASCAL_CONTEXT_THING_CLASS_ID, PASCAL_CONTEXT_STUFF_CLASS, PASCAL_CONTEXT_THING_CLASS, CONTEXTDataset
42
+ from data.gres import GReferDataset
43
+ from data.pascal459 import PASCAL_459_THING_CLASS_ID, PASCAL_459_STUFF_CLASS_ID, PASCAL_459_THING_CLASS, PASCAL_459_STUFF_CLASS, Pascal459Dataset
44
+ from data.refcoco import ReferDataset
45
+ from data.voc import VOC_CLASSES, VOCDataset
46
+
47
+
48
+ IMAGE_WIDTH, IMAGE_HEIGHT = 512, 512
49
+
50
+ # set random seed
51
+ torch.manual_seed(0)
52
+ np.random.seed(0)
53
+
54
+
55
+ def get_dataset(cfg, ds_name, split, transform, data_root=None):
56
+ """Get dataset."""
57
+ data_args = dict(root=data_root) if data_root is not None else {}
58
+ if 'refcoco' in ds_name:
59
+ splitby = cfg.test.splitby if hasattr(cfg.test, 'splitby') else 'unc'
60
+ ds = ReferDataset(
61
+ dataset=ds_name,
62
+ splitBy=splitby,
63
+ split=split,
64
+ image_transforms=transform,
65
+ target_transforms=transform,
66
+ eval_mode=True,
67
+ prompts_augment=cfg.test.prompts_augment,
68
+ **data_args,
69
+ )
70
+ elif ds_name == 'gres':
71
+ ds = GReferDataset(split=split, transform=transform, **data_args)
72
+ elif ds_name == 'voc':
73
+ ds = VOCDataset(
74
+ year='2012',
75
+ split=split,
76
+ transform=transform,
77
+ target_transform=transform,
78
+ **data_args,
79
+ )
80
+
81
+ elif ds_name == 'cocostuff':
82
+ ds = COCODataset(transform=transform, **data_args)
83
+
84
+ elif ds_name == 'context':
85
+ ds = CONTEXTDataset(
86
+ year='2010', transform=transform, split=split, **data_args
87
+ )
88
+ elif ds_name == 'ade':
89
+ ds = ADEDataset(split=split, transform=transform, **data_args)
90
+ elif ds_name == 'pascal_459':
91
+ ds = Pascal459Dataset(split=split, transform=transform, **data_args)
92
+ elif ds_name == 'ade_847':
93
+ ds = ADE847Dataset(split=split, transform=transform, **data_args)
94
+ else:
95
+ raise ValueError(f'Dataset {ds_name} not implemented')
96
+ return ds
97
+
98
+
99
+ def get_transform():
100
+ transforms = [
101
+ T.Resize((IMAGE_WIDTH, IMAGE_HEIGHT)),
102
+ T.ToTensor(),
103
+ ]
104
+
105
+ return T.Compose(transforms)
106
+
107
+
108
+ def assign_label(
109
+ all_masks,
110
+ scores,
111
+ stuff_masks=None,
112
+ stuff_scores=None,
113
+ id_mapping=None,
114
+ stuff_id_mapping=None,
115
+ ):
116
+ """Assign labels."""
117
+ label_preds = np.zeros_like(all_masks[0]).astype(np.int32)
118
+ if stuff_masks is not None:
119
+ sorted_idxs = np.argsort(stuff_scores.detach().cpu().numpy())
120
+ stuff_masks = stuff_masks[sorted_idxs]
121
+ stuff_scores = stuff_scores.detach().cpu().numpy()[sorted_idxs]
122
+ for sorted_idx, mask, score in zip(sorted_idxs, stuff_masks, stuff_scores):
123
+ if score > 0:
124
+ # convert mask to boolean
125
+ mask = mask > 0.5
126
+ # assign label
127
+ if stuff_id_mapping is not None:
128
+ label_preds[mask] = stuff_id_mapping[sorted_idx] + 1
129
+ else:
130
+ label_preds[mask] = sorted_idx + 1
131
+ sorted_idxs = np.argsort(scores.detach().cpu().numpy())
132
+ all_masks = all_masks[sorted_idxs]
133
+ scores = scores.detach().cpu().numpy()[sorted_idxs]
134
+ for sorted_idx, mask, score in zip(sorted_idxs, all_masks, scores):
135
+ if score > 0:
136
+ # convert mask to boolean
137
+ mask = mask > 0.5
138
+ # assign label
139
+ if id_mapping is not None:
140
+ label_preds[mask] = id_mapping[sorted_idx] + 1
141
+ else:
142
+ label_preds[mask] = sorted_idx + 1
143
+
144
+ return label_preds
145
+
146
+
147
+ def eval_semantic(
148
+ label_space,
149
+ algo,
150
+ cfg,
151
+ model,
152
+ image_path,
153
+ stuff_label_space=None,
154
+ sam_pipeline=None,
155
+ ):
156
+ """Semantic segmentation evaluation."""
157
+
158
+ if label_space is None:
159
+ raise ValueError(
160
+ 'label_space must be provided for semantic segmentation evaluation'
161
+ )
162
+ if algo == 'car':
163
+ all_masks, scores = inference_car(
164
+ cfg, model, image_path, label_space, sam_pipeline=sam_pipeline
165
+ )
166
+ if stuff_label_space is not None:
167
+ if cfg.test.ds_name == 'context':
168
+ thing_id_mapping = PASCAL_CONTEXT_THING_CLASS_ID
169
+ stuff_id_mapping = PASCAL_CONTEXT_STUFF_CLASS_ID
170
+ elif cfg.test.ds_name == 'ade':
171
+ thing_id_mapping = ADE_THING_CLASS_ID
172
+ stuff_id_mapping = ADE_STUFF_CLASS_ID
173
+ elif cfg.test.ds_name == 'pascal_459':
174
+ thing_id_mapping = PASCAL_459_THING_CLASS_ID
175
+ stuff_id_mapping = PASCAL_459_STUFF_CLASS_ID
176
+ elif cfg.test.ds_name == 'ade_847':
177
+ thing_id_mapping = ADE_847_THING_CLASS_ID
178
+ stuff_id_mapping = ADE_847_STUFF_CLASS_ID
179
+ else:
180
+ raise ValueError(f'Dataset {cfg.test.ds_name} not supported')
181
+
182
+ model.mask_generator.set_bg_cls(label_space)
183
+ model.set_visual_prompt_type(cfg.car.stuff_visual_prompt_type)
184
+ model.set_bg_factor(cfg.car.stuff_bg_factor)
185
+ stuff_masks, stuff_scores = inference_car(
186
+ cfg, model, image_path, stuff_label_space, sam_pipeline=sam_pipeline
187
+ )
188
+ model.mask_generator.set_bg_cls(cfg.car.bg_cls)
189
+ model.set_visual_prompt_type(cfg.car.visual_prompt_type)
190
+ model.set_bg_factor(cfg.car.bg_factor)
191
+ all_masks = all_masks.detach().cpu().numpy()
192
+ stuff_masks = stuff_masks.detach().cpu().numpy()
193
+ label_preds = assign_label(
194
+ all_masks,
195
+ scores,
196
+ stuff_masks=stuff_masks,
197
+ stuff_scores=stuff_scores,
198
+ id_mapping=thing_id_mapping,
199
+ stuff_id_mapping=stuff_id_mapping,
200
+ )
201
+ else:
202
+ all_masks = all_masks.detach().cpu().numpy()
203
+ label_preds = assign_label(all_masks, scores)
204
+ return label_preds.squeeze()
205
+ else:
206
+ raise NotImplementedError(f'algo {algo} not implemented')
207
+
208
+
209
+ def _fast_hist(label_true, label_pred, n_class=21):
210
+ mask = (label_true >= 0) & (label_true < n_class)
211
+ hist = np.bincount(
212
+ n_class * label_true[mask].astype(int) + label_pred[mask],
213
+ minlength=n_class**2,
214
+ ).reshape(n_class, n_class)
215
+ return hist
216
+
217
+
218
+ def semantic_iou(label_trues, label_preds, n_class=21, ignore_background=False):
219
+ """Semantic segmentation IOU."""
220
+
221
+ hist = np.zeros((n_class, n_class))
222
+ for lt, lp in zip(label_trues, label_preds):
223
+ hist += _fast_hist(lt.flatten(), lp.flatten(), n_class)
224
+ if ignore_background:
225
+ hist = hist[1:, 1:]
226
+ acc = np.diag(hist).sum() / hist.sum()
227
+ acc_cls = np.diag(hist) / hist.sum(axis=1)
228
+ acc_cls = np.nanmean(acc_cls)
229
+ iu = np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))
230
+ valid = hist.sum(axis=1) > 0 # added
231
+ if valid.sum() == 0:
232
+ mean_iu = 0
233
+ else:
234
+ mean_iu = np.nanmean(iu[valid])
235
+ freq = hist.sum(axis=1) / hist.sum()
236
+ fwavacc = (freq[freq > 0] * iu[freq > 0]).sum()
237
+ if ignore_background:
238
+ cls_iu = dict(zip(range(1, n_class), iu))
239
+ else:
240
+ cls_iu = dict(zip(range(n_class), iu))
241
+
242
+ return {
243
+ 'Pixel Accuracy': acc,
244
+ 'Mean Accuracy': acc_cls,
245
+ 'Frequency Weighted IoU': fwavacc,
246
+ 'mIoU': mean_iu,
247
+ 'Class IoU': cls_iu,
248
+ }
249
+
250
+
251
+ def evaluate(
252
+ data_loader,
253
+ cfg,
254
+ model,
255
+ test_cfg,
256
+ label_space=None,
257
+ stuff_label_space=None,
258
+ sam_pipeline=None,
259
+ ):
260
+ """Run evaluation."""
261
+
262
+ if (
263
+ test_cfg.ds_name
264
+ not in ['voc', 'cocostuff', 'context', 'ade', 'pascal_459', 'ade_847']
265
+ and test_cfg.seg_mode == 'semantic'
266
+ ):
267
+ raise ValueError((
268
+ 'Semantic segmentation evaluation is only implemented for voc, '
269
+ 'cocostuff, context, ade, pascal_459, and ade_847 datasets'
270
+ ))
271
+
272
+ metric_logger = MetricLogger(delimiter=' ')
273
+ metric_logger.add_meter(
274
+ 'mIoU', SmoothedValue(window_size=1, fmt='{value:.4f} ({global_avg:.4f})')
275
+ )
276
+ # evaluation variables
277
+ cum_i, cum_u = 0, 0
278
+ eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
279
+ seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
280
+ seg_total = 0
281
+ mean_iou = []
282
+ header = 'Test:'
283
+
284
+ # all_masks = []
285
+ label_preds, label_gts = [], []
286
+ print(len(data_loader))
287
+ cc = 0
288
+ use_tensorboard = False
289
+ if hasattr(cfg.test, 'use_tensorboard'):
290
+ use_tensorboard = cfg.test.use_tensorboard
291
+
292
+ if use_tensorboard:
293
+ writer = tensorboard.SummaryWriter(log_dir=cfg.test.output_path)
294
+ for data in metric_logger.log_every(data_loader, 1, header):
295
+ _, image_paths, target_list, sentences_list = data
296
+ # print(type(target_lis))
297
+
298
+ if not isinstance(target_list, list):
299
+ target_list, sentences_list = [target_list], [sentences_list]
300
+ for target, sentences in zip(target_list, sentences_list):
301
+ image_path = image_paths[0]
302
+ # print(image_path)
303
+ if test_cfg.seg_mode == 'refer':
304
+ all_masks, all_scores = inference_car(
305
+ cfg, model, image_path, sentences, sam_pipeline=sam_pipeline
306
+ )
307
+ # final_mask = merge_masks(all_masks, *target.shape[1:])
308
+ final_mask = merge_masks_simple(
309
+ all_masks, *target.shape[1:], scores=all_scores
310
+ )
311
+ intersection, union, cur_iou = compute_iou(final_mask, target)
312
+ # cur_iou = IoU(final_mask, target, 0)
313
+ metric_logger.update(mIoU=cur_iou)
314
+ mean_iou.append(cur_iou)
315
+ if use_tensorboard:
316
+ writer.add_scalar('Mean IoU', cur_iou, cc)
317
+ cum_i += intersection
318
+ cum_u += union
319
+ for n_eval_iou in range(len(eval_seg_iou_list)):
320
+ eval_seg_iou = eval_seg_iou_list[n_eval_iou]
321
+ seg_correct[n_eval_iou] += cur_iou >= eval_seg_iou
322
+ seg_total += 1
323
+ elif test_cfg.seg_mode == 'semantic':
324
+ # torch.cuda.empty_cache()
325
+ label_pred = eval_semantic(
326
+ label_space,
327
+ test_cfg.algo,
328
+ cfg,
329
+ model,
330
+ image_path,
331
+ stuff_label_space,
332
+ )
333
+ label_gt = target.squeeze().cpu().numpy()
334
+ cur_iou = semantic_iou(
335
+ [label_gt],
336
+ [label_pred],
337
+ n_class=cfg.test.n_class,
338
+ ignore_background=cfg.test.ignore_background,
339
+ )['mIoU']
340
+ metric_logger.update(mIoU=cur_iou)
341
+ label_preds.append(label_pred)
342
+ label_gts.append(label_gt)
343
+
344
+ cc += 1
345
+
346
+ if test_cfg.seg_mode == 'refer':
347
+ mean_iou = np.array(mean_iou)
348
+ m_iou = np.mean(mean_iou)
349
+ if use_tensorboard:
350
+ writer.add_scalar('mIoU', m_iou.item(), len(data_loader))
351
+ print('Final results:')
352
+ print('Mean IoU is %.2f\n' % (m_iou * 100.0))
353
+ results_str = ''
354
+ for n_eval_iou in range(len(eval_seg_iou_list)):
355
+ results_str += ' precision@%s = %.2f\n' % (
356
+ str(eval_seg_iou_list[n_eval_iou]),
357
+ seg_correct[n_eval_iou] * 100.0 / seg_total,
358
+ )
359
+ o_iou = cum_i * 100.0 / cum_u
360
+ results_str += ' overall IoU = %.2f\n' % o_iou
361
+ if use_tensorboard:
362
+ writer.add_scalar('oIoU', o_iou, 0)
363
+ print(results_str)
364
+ elif test_cfg.seg_mode == 'semantic':
365
+ iou_score = semantic_iou(
366
+ label_gts,
367
+ label_preds,
368
+ n_class=cfg.test.n_class,
369
+ ignore_background=cfg.test.ignore_background,
370
+ )
371
+ if use_tensorboard:
372
+ writer.add_scalar('mIoU', iou_score['mIoU'].item(), len(data_loader))
373
+
374
+ print(iou_score)
375
+ if use_tensorboard:
376
+ writer.close()
377
+
378
+
379
+ def compute_iou(pred_seg, gd_seg):
380
+ """Compute IoU."""
381
+ intersection = torch.sum(torch.logical_and(pred_seg, gd_seg))
382
+ union = torch.sum(torch.logical_or(pred_seg, gd_seg))
383
+ iou = intersection * 1.0 / union
384
+ if union == 0:
385
+ iou = 0
386
+ return intersection, union, iou
387
+
388
+
389
+ def list_of_strings(arg):
390
+ return [a.strip() for a in arg.split(',')]
391
+
392
+
393
+ # pylint: disable=redefined-outer-name
394
+ def parse_args():
395
+ """Parse arguments."""
396
+ parser = argparse.ArgumentParser(description='Evaluation')
397
+ parser.add_argument(
398
+ '--cfg-path',
399
+ default='configs/refcoco_test_prompt.yaml',
400
+ help='path to configuration file.',
401
+ )
402
+ parser.add_argument('--index', default=0, type=int, help='split task')
403
+ parser.add_argument('--mask_threshold', default=0.0, type=float)
404
+ parser.add_argument('--confidence_threshold', default=0.0, type=float)
405
+ parser.add_argument('--clipes_threshold', default=0.0, type=float)
406
+ parser.add_argument('--stuff_bg_factor', default=0.0, type=float)
407
+ parser.add_argument('--bg_factor', default=0.0, type=float)
408
+ parser.add_argument('--output_path', default=None, type=str)
409
+ parser.add_argument(
410
+ '--visual_prompt_type', default=None, type=list_of_strings
411
+ )
412
+ parser.add_argument(
413
+ '--stuff_visual_prompt_type', default=None, type=list_of_strings
414
+ )
415
+
416
+ args = parser.parse_args()
417
+
418
+ return args
419
+
420
+
421
+ def main(args):
422
+ cfg = Config(**load_yaml(args.cfg_path))
423
+ if args.mask_threshold > 0:
424
+ cfg.car.mask_threshold = args.mask_threshold
425
+ if args.confidence_threshold > 0:
426
+ cfg.car.confidence_threshold = args.confidence_threshold
427
+ if args.clipes_threshold > 0:
428
+ cfg.car.clipes_threshold = args.clipes_threshold
429
+ if args.bg_factor > 0:
430
+ cfg.car.bg_factor = args.bg_factor
431
+ if args.stuff_bg_factor > 0:
432
+ cfg.car.stuff_bg_factor = args.stuff_bg_factor
433
+ if args.output_path is not None:
434
+ cfg.test.output_path = args.output_path
435
+ if args.visual_prompt_type is not None:
436
+ cfg.car.visual_prompt_type = args.visual_prompt_type
437
+ if args.stuff_visual_prompt_type is not None:
438
+ cfg.car.stuff_visual_prompt_type = args.stuff_visual_prompt_type
439
+
440
+ try:
441
+ data_root = cfg.test.data_root
442
+ except (AttributeError, ValueError):  # tolerate a missing optional data_root entry
443
+ data_root = None
444
+
445
+ dataset_test = get_dataset(
446
+ cfg, cfg.test.ds_name, cfg.test.split, get_transform(), data_root
447
+ )
448
+
449
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
450
+
451
+ stuff_label_space = None
452
+ if cfg.test.ds_name == 'voc':
453
+ label_space = VOC_CLASSES
454
+ elif cfg.test.ds_name == 'cocostuff':
455
+ label_space = COCO_OBJECT_CLASSES
456
+ elif cfg.test.ds_name == 'context':
457
+ # label_space = PASCAL_CONTEXT_CLASSES
458
+ label_space = PASCAL_CONTEXT_THING_CLASS
459
+ stuff_label_space = PASCAL_CONTEXT_STUFF_CLASS
460
+ elif cfg.test.ds_name == 'ade':
461
+ label_space = ADE_THING_CLASS
462
+ stuff_label_space = ADE_STUFF_CLASS
463
+ elif cfg.test.ds_name == 'pascal_459':
464
+ label_space = PASCAL_459_THING_CLASS
465
+ stuff_label_space = PASCAL_459_STUFF_CLASS
466
+ elif cfg.test.ds_name == 'ade_847':
467
+ label_space = ADE_847_THING_CLASS
468
+ stuff_label_space = ADE_847_STUFF_CLASS
469
+ else:
470
+ label_space = None
471
+
472
+ num_chunks, chunk_index = 1, 0
473
+ if hasattr(cfg.test, 'num_chunks'):
474
+ num_chunks = cfg.test.num_chunks
475
+ if hasattr(cfg.test, 'chunk_index'):
476
+ chunk_index = cfg.test.chunk_index
477
+ # Size of each chunk
478
+ chunk_size = len(dataset_test) // num_chunks
479
+ # Choose which chunk to load (0-indexed)
480
+ # Define a subset of the dataset
481
+ subset_indices = range(
482
+ chunk_index * chunk_size, (chunk_index + 1) * chunk_size
483
+ )
484
+ subset_dataset = Subset(dataset_test, indices=subset_indices)
485
+
486
+ data_loader_test = torch.utils.data.DataLoader(
487
+ subset_dataset, batch_size=1, shuffle=False, num_workers=1
488
+ )
489
+
490
+ car_model = CaR(cfg, device=device, seg_mode=cfg.test.seg_mode)
491
+
492
+ car_model = car_model.to(device)
493
+
494
+ if not cfg.test.use_pseudo and cfg.test.sam_mask_root is None:
495
+ print('Using sam online')
496
+ # sam_checkpoint, model_type = build_sam_config(cfg)
497
+ build_sam_config(cfg)
498
+
499
+ evaluate(
500
+ data_loader_test,
501
+ cfg,
502
+ car_model,
503
+ test_cfg=cfg.test,
504
+ label_space=label_space,
505
+ stuff_label_space=stuff_label_space,
506
+ )
507
+
508
+
509
+ if __name__ == '__main__':
510
+ args = parse_args()
511
+ main(args)
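For reference, a minimal sketch of driving the evaluation entry point above programmatically; the config path is the argparse default from `parse_args`, the script name in `argv[0]` is a placeholder, and the two threshold overrides are illustrative values rather than recommended settings.

```
# Hedged usage sketch: equivalent to invoking the script from the command line
# with the same flags.
import sys

sys.argv = [
    "evaluate",                                        # placeholder program name
    "--cfg-path", "configs/refcoco_test_prompt.yaml",  # default from parse_args
    "--mask_threshold", "0.6",                         # illustrative override
    "--confidence_threshold", "0.45",                  # illustrative override
]
args = parse_args()
main(args)
```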
modeling/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
modeling/model/cam.py ADDED
@@ -0,0 +1,222 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Get CAM activation."""
17
+
18
+ import cv2
19
+ import numpy as np
20
+ import torch
21
+
22
+
23
+ _EPSILON = 1e-15
24
+
25
+
26
+ def scale_cam_image(cam, target_size=None):
27
+ """Normalize and rescale cam image."""
28
+ result = []
29
+ for img in cam:
30
+ img = img - np.min(img)
31
+ img = img / (_EPSILON + np.max(img))
32
+ if target_size is not None:
33
+ img = cv2.resize(img, target_size)
34
+ result.append(img)
35
+ result = np.float32(result)
36
+
37
+ return result
38
+
39
+
40
+ class ActivationsAndGradients:
41
+ """Class for extracting activations and registering gradients from targetted intermediate layers."""
42
+
43
+ def __init__(self, model, target_layers, reshape_transform, stride=16):
44
+ self.model = model
45
+ self.gradients = []
46
+ self.activations = []
47
+ self.reshape_transform = reshape_transform
48
+ self.handles = []
49
+ self.stride = stride
50
+ for target_layer in target_layers:
51
+ self.handles.append(
52
+ target_layer.register_forward_hook(self.save_activation)
53
+ )
54
+ # Because of https://github.com/pytorch/pytorch/issues/61519,
55
+ # we don't use backward hook to record gradients.
56
+ self.handles.append(
57
+ target_layer.register_forward_hook(self.save_gradient)
58
+ )
59
+
60
+ # pylint: disable=unused-argument
61
+ # pylint: disable=redefined-builtin
62
+ def save_activation(self, module, input, output):
63
+ """Saves activations from targetted layer."""
64
+ activation = output
65
+
66
+ if self.reshape_transform is not None:
67
+ activation = self.reshape_transform(activation, self.height, self.width)
68
+ self.activations.append(activation.cpu().detach())
69
+
70
+ def save_gradient(self, module, input, output):
71
+ if not hasattr(output, "requires_grad") or not output.requires_grad:
72
+ # Hooks can only be registered on tensors that require grad.
73
+ return
74
+
75
+ # Gradients are computed in reverse order
76
+ def _store_grad(grad):
77
+ if self.reshape_transform is not None:
78
+ grad = self.reshape_transform(grad, self.height, self.width)
79
+ self.gradients = [grad.cpu().detach()] + self.gradients
80
+
81
+ output.register_hook(_store_grad)
82
+
83
+ # pylint: enable=unused-argument
84
+ # pylint: enable=redefined-builtin
85
+
86
+ def __call__(self, x, h, w):
87
+ self.height = h // self.stride
88
+ self.width = w // self.stride
89
+ self.gradients = []
90
+ self.activations = []
91
+ if isinstance(x, tuple) or isinstance(x, list):
92
+ return self.model.forward_last_layer(x[0], x[1])
93
+ else:
94
+ return self.model(x)
95
+
96
+ def release(self):
97
+ for handle in self.handles:
98
+ handle.remove()
99
+
100
+
101
+ # pylint: disable=g-bare-generic
102
+ class CAM:
103
+ """CAM module."""
104
+
105
+ def __init__(
106
+ self,
107
+ model,
108
+ target_layers,
109
+ use_cuda=False,
110
+ reshape_transform=None,
111
+ compute_input_gradient=False,
112
+ stride=16,
113
+ ):
114
+ self.model = model.eval()
115
+ self.target_layers = target_layers
116
+ self.cuda = use_cuda
117
+ self.model = model.cuda() if self.cuda else self.model
118
+ self.reshape_transform = reshape_transform
119
+ self.compute_input_gradient = compute_input_gradient
120
+ self.activations_and_grads = ActivationsAndGradients(
121
+ self.model, target_layers, reshape_transform, stride=stride
122
+ )
123
+
124
+ def get_cam(self, activations, grads):
125
+ weights = np.mean(grads, axis=(2, 3))
126
+ weighted_activations = weights[:, :, None, None] * activations
127
+ cam = weighted_activations.sum(axis=1)
128
+ return cam
129
+
130
+ def forward(
131
+ self,
132
+ input_tensor,
133
+ targets,
134
+ target_size,
135
+ ):
136
+ """CAM forward pass."""
137
+ if self.compute_input_gradient:
138
+ input_tensor = torch.autograd.Variable(input_tensor, requires_grad=True)
139
+
140
+ w, h = self.get_target_width_height(input_tensor)
141
+ outputs = self.activations_and_grads(input_tensor, h, w)
142
+
143
+ self.model.zero_grad()
144
+ if isinstance(input_tensor, (tuple, list)):
145
+ loss = sum(
146
+ [target(output[0]) for target, output in zip(targets, outputs)]
147
+ )
148
+ else:
149
+ loss = sum([target(output) for target, output in zip(targets, outputs)])
150
+ loss.backward(retain_graph=True)
151
+ cam_per_layer = self.compute_cam_per_layer(target_size)
152
+ if isinstance(input_tensor, (tuple, list)):
153
+ return (
154
+ self.aggregate_multi_layers(cam_per_layer),
155
+ outputs[0],
156
+ outputs[1],
157
+ )
158
+ else:
159
+ return self.aggregate_multi_layers(cam_per_layer), outputs
160
+
161
+ def get_target_width_height(self, input_tensor):
162
+ width = None
163
+ height = None
164
+ if isinstance(input_tensor, (tuple, list)):
165
+ width, height = input_tensor[-1], input_tensor[-2]
166
+ return width, height
167
+
168
+ def compute_cam_per_layer(self, target_size):
169
+ """Computes cam per target layer."""
170
+ activations_list = [
171
+ a.cpu().data.numpy() for a in self.activations_and_grads.activations
172
+ ]
173
+ grads_list = [
174
+ g.cpu().data.numpy() for g in self.activations_and_grads.gradients
175
+ ]
176
+
177
+ cam_per_target_layer = []
178
+ # Loop over the saliency image from every layer
179
+ for i in range(len(self.target_layers)):
180
+ layer_activations = None
181
+ layer_grads = None
182
+ if i < len(activations_list):
183
+ layer_activations = activations_list[i]
184
+ if i < len(grads_list):
185
+ layer_grads = grads_list[i]
186
+
187
+ cam = self.get_cam(layer_activations, layer_grads)
188
+ cam = np.maximum(cam, 0).astype(np.float32) # float16->32
189
+ scaled = scale_cam_image(cam, target_size)
190
+ cam_per_target_layer.append(scaled[:, None, :])
191
+
192
+ return cam_per_target_layer
193
+
194
+ def aggregate_multi_layers(self, cam_per_target_layer):
195
+ cam_per_target_layer = np.concatenate(cam_per_target_layer, axis=1)
196
+ cam_per_target_layer = np.maximum(cam_per_target_layer, 0)
197
+ result = np.mean(cam_per_target_layer, axis=1)
198
+ return scale_cam_image(result)
199
+
200
+ def __call__(
201
+ self,
202
+ input_tensor,
203
+ targets=None,
204
+ target_size=None,
205
+ ):
206
+ return self.forward(input_tensor, targets, target_size)
207
+
208
+ def __del__(self):
209
+ self.activations_and_grads.release()
210
+
211
+ def __enter__(self):
212
+ return self
213
+
214
+ def __exit__(self, exc_type, exc_value, exc_tb):
215
+ self.activations_and_grads.release()
216
+ if isinstance(exc_value, IndexError):
217
+ # Handle IndexError here...
218
+ print(
219
+ f"An exception occurred in CAM with block: {exc_type}. "
220
+ f"Message: {exc_value}"
221
+ )
222
+ return True
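As a quick illustration of the Grad-CAM style weighting performed by `CAM.get_cam` and `compute_cam_per_layer` above, here is a small self-contained NumPy sketch; the shapes and values are toy inputs, not taken from the model.

```
import numpy as np

activations = np.random.rand(1, 4, 7, 7).astype(np.float32)  # (N, C, H, W) activations
grads = np.random.rand(1, 4, 7, 7).astype(np.float32)        # (N, C, H, W) gradients

weights = np.mean(grads, axis=(2, 3))                        # per-channel weights, (N, C)
cam = (weights[:, :, None, None] * activations).sum(axis=1)  # weighted sum over channels
cam = np.maximum(cam, 0)                                     # ReLU, as in compute_cam_per_layer
print(cam.shape)                                             # (1, 7, 7)
```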
modeling/model/car.py ADDED
@@ -0,0 +1,318 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Implementation of CaR."""
17
+
18
+ import os
19
+
20
+ import clip
21
+ import numpy as np
22
+ import torch
23
+ from torch import nn
24
+ import torch.nn.functional as F
25
+
26
+ # pylint: disable=g-importing-member
27
+ # pylint: disable=g-bad-import-order
28
+ from modeling.model.clip_wrapper import CLIPWrapper
29
+ from modeling.model.clip_wrapper import forward_clip
30
+ from modeling.model.clipcam import CLIPCAM
31
+ from modeling.model.crf import PostProcess
32
+ from modeling.model.utils import apply_visual_prompts
33
+ from utils.visualize import viz_attn
34
+
35
+
36
+ class CaR(nn.Module):
37
+ """CaR module."""
38
+
39
+ def __init__(
40
+ self,
41
+ cfg,
42
+ device="cpu",
43
+ visualize=False,
44
+ confidence_threshold=0.45,
45
+ save_path="save_path",
46
+ seg_mode="refer",
47
+ semantic_clip_model_name=None,
48
+ semantic_pretrained_data=None,
49
+ semantic_templates=None,
50
+ text_template=None,
51
+ visual_prompt_type="circle",
52
+ clipes_threshold=0.4,
53
+ cam_text_template="a clean origami {}.",
54
+ bg_cls=None,
55
+ iom_thres=0.6,
56
+ min_pred_threshold=0.01,
57
+ bg_factor=1.0,
58
+ mask_threshold=0.5,
59
+ ):
60
+ """CaR model for image segmentation.
61
+
62
+ Args:
63
+ cfg: the config file.
64
+ device: the device to run the model.
65
+ visualize: whether to visualize the intermediate results
66
+ confidence_threshold: the confidence threshold for semantic
67
+ segmentation. If the confidence score is lower than the threshold, the
68
+ mask will be discarded.
69
+ save_path: the path to save the intermediate results
70
+ seg_mode: the segmentation mode, can be 'refer' or 'semantic'
71
+ semantic_clip_model_name: the name of the semantic segmentation model.
72
+ semantic_pretrained_data: the path to the pretrained semantic
73
+ segmentation model.
74
+ semantic_templates: the templates for semantic segmentation.
75
+ text_template: the template for visual prompting.
76
+ visual_prompt_type: the type of visual prompting.
77
+ clipes_threshold: the threshold for CLIPES.
78
+ cam_text_template: the template for CAM.
79
+ bg_cls: background classes.
80
+ iom_thres: IoM threshold.
81
+ min_pred_threshold: Prediction threshold.
82
+ bg_factor: Background factor.
83
+ mask_threshold: Mask threshold.
84
+ """
85
+ super(CaR, self).__init__()
86
+ # CLIP parameters
87
+ self.confidence_threshold = confidence_threshold
88
+ self.device = device
89
+ self.visualize = visualize
90
+ self.save_path = save_path
91
+ self.seg_mode = seg_mode
92
+ self.semantic_clip_model_name = semantic_clip_model_name
93
+ self.semantic_pretrained_data = semantic_pretrained_data
94
+ self.semantic_templates = semantic_templates
95
+ self.text_template = text_template
96
+ self.visual_prompt_type = visual_prompt_type
97
+ self.clipes_threshold = clipes_threshold
98
+ self.cam_text_template = cam_text_template
99
+ self.iom_thres = iom_thres
100
+ self.min_pred_threshold = min_pred_threshold
101
+ self.bg_cls = bg_cls
102
+ self.bg_factor = bg_factor
103
+ self.mask_threshold = mask_threshold
104
+
105
+ if not hasattr(cfg, "clip"):
106
+ raise ValueError("The config file should contain the CLIP parameters.")
107
+
108
+ if not hasattr(cfg, "car"):
109
+ raise ValueError("The config file should contain the car parameters.")
110
+
111
+ if hasattr(cfg, "cam"):
112
+ raise ValueError("cfg.cam is deprecated, please use cfg.car ")
113
+
114
+ for k, v in vars(cfg.clip).items():
115
+ setattr(self, k, v)
116
+
117
+ for k, v in vars(cfg.car).items():
118
+ setattr(self, k, v)
119
+
120
+ if hasattr(cfg, "sam"):
121
+ for k, v in vars(cfg.sam).items():
122
+ setattr(self, k, v)
123
+ if not self.bg_cls:
124
+ self.bg_cls = None
125
+ print(f"The model is running on {self.device}")
126
+ self.clip_model, self.preprocess = clip.load(
127
+ self.clip_model_name, device=self.device
128
+ )
129
+ self.clip_model = CLIPWrapper(self.clip_model)
130
+ self.post_process = PostProcess(device=self.device)
131
+ self.mask_generator = CLIPCAM(
132
+ self.clip_model,
133
+ device=self.device,
134
+ text_template=self.text_template,
135
+ threshold=self.clipes_threshold,
136
+ bg_cls=self.bg_cls,
137
+ )
138
+ self.semantic_clip_model, self.semantic_preprocess = clip.load(
139
+ self.semantic_clip_model_name, device=self.device
140
+ )
141
+ self.semantic_clip_model = CLIPWrapper(self.semantic_clip_model)
142
+
143
+ def get_confidence(self, cam_map, binary_cam_map):
144
+ confidence_map = torch.sum(cam_map * binary_cam_map[None], dim=[2, 3])
145
+ confidence_map = confidence_map / torch.sum(binary_cam_map, dim=[1, 2])
146
+ confidence_score = confidence_map.squeeze()
147
+ return confidence_score
148
+
149
+ def set_visual_prompt_type(self, visual_prompt_type):
150
+ self.visual_prompt_type = visual_prompt_type
151
+
152
+ def set_bg_factor(self, bg_factor):
153
+ self.bg_factor = bg_factor
154
+
155
+ def set_confidence_threshold(self, confidence_threshold):
156
+ self.confidence_threshold = confidence_threshold
157
+
158
+ def set_mask_threshold(self, mask_threshold):
159
+ self.mask_threshold = mask_threshold
160
+
161
+ def apply_visual_prompts(self, image, mask):
162
+ if torch.sum(mask).item() <= 1:
163
+ return image
164
+ image_array = np.array(image)
165
+ img_h = image_array.shape[0]
166
+ img_w = image_array.shape[1]
167
+ mask = (
168
+ F.interpolate(mask[None][None], size=(img_h, img_w), mode="nearest")
169
+ .squeeze()
170
+ .detach()
171
+ .cpu()
172
+ .numpy()
173
+ )
174
+ mask = (mask > self.mask_threshold).astype(np.uint8)
175
+ prompted_image = apply_visual_prompts(
176
+ image_array, mask, self.visual_prompt_type, self.visualize
177
+ )
178
+ return prompted_image
179
+
180
+ def get_mask_confidence(self, prompted_images, prompt_text):
181
+ """Get the confidene for each mask with visual prompting."""
182
+ # get the center, width and height of the mask
183
+ prompted_tensor = torch.stack(
184
+ [self.semantic_preprocess(img) for img in prompted_images], dim=0
185
+ )
186
+ prompted_tensor = prompted_tensor.to(self.device)
187
+ h, w = prompted_tensor.shape[-2:]
188
+ text_prediction = forward_clip(
189
+ self.semantic_clip_model, prompted_tensor, prompt_text, h, w
190
+ )
191
+ return text_prediction
192
+
193
+ def _filter_texts(self, ori_mask_id, sem_scores, prompt_text):
194
+ """Remove false positive masks by score filtering and recall the backbone to get the CAM maps for the filtered texts."""
195
+ if not ori_mask_id:
196
+ max_id = np.argmax(sem_scores)
197
+ ori_mask_id.append(max_id)
198
+ filtered_text = [prompt_text[i] for i in ori_mask_id]
199
+ return filtered_text
200
+
201
+ def _forward_stage(self, ori_img, cam_text, clip_text, semantic_prompt_text):
202
+ mask_proposals = self.get_mask_proposals(ori_img, cam_text)
203
+ num_texts = len(cam_text)
204
+ ori_mask_id = []
205
+ sem_scores = torch.zeros((num_texts,), device=self.device).float()
206
+ prompted_imgs = [
207
+ self.apply_visual_prompts(ori_img, cam_map)
208
+ for cam_map in mask_proposals
209
+ ]
210
+ text_scores = self.get_mask_confidence(prompted_imgs, semantic_prompt_text)
211
+ mask_scores = torch.diagonal(text_scores)
212
+ for mask_idx, mask_score in enumerate(mask_scores):
213
+ # record mask idx
214
+ if mask_score > self.confidence_threshold:
215
+ ori_mask_id.append(mask_idx)
216
+ sem_scores[mask_idx] = mask_score
217
+ sem_scores = sem_scores.cpu().detach().numpy()
218
+ filtered_texts = self._filter_texts(ori_mask_id, sem_scores, clip_text)
219
+ # if isinstance(ori_img, list):
220
+ # ori_img = [ori_img[i] for i in ori_mask_id]
221
+
222
+ all_scores = torch.zeros((num_texts,), device=self.device).float()
223
+ sem_scores = torch.from_numpy(sem_scores).to(self.device)
224
+ for new_id, ori_id in enumerate(ori_mask_id):
225
+ if new_id >= len(mask_proposals):
226
+ # the mask is filtered out.
227
+ continue
228
+ all_scores[ori_id] = sem_scores[ori_id]
229
+ return filtered_texts, all_scores, mask_proposals
230
+
231
+ def _get_save_path(self, text):
232
+ folder_name = "_".join([t.replace(" ", "_") for t in text])
233
+ if len(folder_name) > 20:
234
+ folder_name = folder_name[:20]
235
+ output_path = os.path.join(self.save_path, folder_name)
236
+ sub_output_path = [
237
+ os.path.join(output_path, t.replace(" ", "_")) for t in text
238
+ ]
239
+ return output_path, sub_output_path
240
+
241
+ def get_mask_proposals(self, img, text):
242
+ if self.seg_mode == "refer":
243
+ if isinstance(img, list):
244
+ cam_map_list = [self.mask_generator(i, t)[0] for i, t in zip(img, text)]
245
+ else:
246
+ cam_map_list = [self.mask_generator(img, t)[0] for t in text]
247
+ return torch.cat(cam_map_list, dim=0)
248
+ elif self.seg_mode == "semantic":
249
+ return self.mask_generator(img, text)[0]
250
+ else:
251
+ raise ValueError(
252
+ "Unknown segmentation mode. Only refer and semantic segmentation are"
253
+ " supported."
254
+ )
255
+
256
+ def _forward_car(self, ori_img, text):
257
+ if isinstance(text, str):
258
+ text = [text]
259
+ _, sub_output_path = self._get_save_path(text)
260
+ image_array = np.array(ori_img)
261
+ clip_text = [self.cam_text_template.format(t) for t in text]
262
+ cam_text = text
263
+ init_clip_text = clip_text # keep the initial prompts; clip_text is filtered across refinement iterations.
264
+ semantic_prompt_text = clip_text
265
+ # Apply semantic prompting augmentation.
266
+ if self.semantic_templates is not None:
267
+ semantic_prompt_text = []
268
+ for template in self.semantic_templates:
269
+ templated_text = [template.format(t) for t in text]
270
+ semantic_prompt_text.append(templated_text)
271
+
272
+ num_positive_last = 0
273
+ run = 0
274
+ while True:
275
+ run += 1
276
+ cur_texts, all_scores, mask_proposals = self._forward_stage(
277
+ ori_img, cam_text, clip_text, semantic_prompt_text
278
+ )
279
+ if cur_texts: # if there is no text, skip the refinement
280
+ cam_text = cur_texts
281
+ clip_text = cur_texts
282
+
283
+ num_positive = (all_scores > 0).sum().item()
284
+ if num_positive == num_positive_last:
285
+ # stop the refinement if the number of positive masks
286
+ # does not change.
287
+ break
288
+ num_positive_last = num_positive
289
+ # Apply densecrf for refinement.
290
+ # SAM is optional and is applied outside the model.
291
+ refined_masks = self.post_process(
292
+ ori_img,
293
+ mask_proposals,
294
+ separate=self.seg_mode == "refer",
295
+ bg_factor=self.bg_factor,
296
+ )
297
+ predicted_class_idx = [init_clip_text.index(t) for t in cur_texts]
298
+ if self.visualize:
299
+ _ = [
300
+ viz_attn(
301
+ image_array,
302
+ attn,
303
+ prefix=sub_output_path[aid],
304
+ img_name="semantic_mask",
305
+ )
306
+ for aid, attn in enumerate(refined_masks)
307
+ ]
308
+ final_predicted_masks = torch.zeros(len(text), *refined_masks[0].shape)
309
+ final_all_scores = torch.zeros(len(text))
310
+ for idx, mask, score in zip(predicted_class_idx, refined_masks, all_scores):
311
+ final_predicted_masks[idx] = mask
312
+ final_all_scores[idx] = score
313
+ return final_predicted_masks, final_all_scores
314
+
315
+ def forward(self, im_ori, text):
316
+ # im_ori is the input image (resized to 512 x 512 in evaluation); text is a string or a list of strings.
317
+ pseudo_masks, conf_scores = self._forward_car(im_ori, text)
318
+ return pseudo_masks, conf_scores
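A minimal usage sketch of the `CaR` module defined above; the config path and image path are placeholders, and the config is assumed to contain the `clip` and `car` sections the constructor checks for.

```
import torch
from PIL import Image

from modeling.model.car import CaR
from utils.utils import Config, load_yaml

cfg = Config(**load_yaml("path/to/config.yaml"))  # placeholder config with clip/car sections
device = "cuda" if torch.cuda.is_available() else "cpu"
car_model = CaR(cfg, device=device, seg_mode="refer").to(device)

image = Image.open("example.jpg").convert("RGB")  # placeholder image
masks, scores = car_model(image, ["a dog wearing a red collar"])
print(masks.shape, scores)  # one refined mask and one confidence score per query
```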
modeling/model/clip_wrapper.py ADDED
@@ -0,0 +1,297 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """A wrapper for CLIP model to support forward with a list of text inputs."""
17
+
18
+ # pylint: disable=g-importing-member
19
+ import clip
20
+ import numpy as np
21
+ import torch
22
+ from torch import nn
23
+ import torch.nn.functional as F
24
+
25
+ _CONTEXT_LENGTH = 77
26
+
27
+
28
+ def forward_clip_single(model, image, text, h, w):
29
+ """Forward a single text input.
30
+
31
+ Args:
32
+ model (CLIPWrapper or CLIP): the CLIP model.
33
+ image (torch.Tensor): the image tensor.
34
+ text (List[str]): the text input.
35
+ h (int): the height of the image.
36
+ w (int): the width of the image.
37
+
38
+ Returns:
39
+ torch.Tensor: the logits.
40
+ """
41
+ if isinstance(text, str):
42
+ text = [text]
43
+ text_tokens = clip.tokenize(text).to(image.device)
44
+ text_prediction = model(image, text_tokens, h, w)
45
+ return text_prediction.detach().cpu()
46
+
47
+
48
+ def forward_clip(model, image, text, h, w):
49
+ """Forward a list of text inputs.
50
+
51
+ Args:
52
+ model (CLIPWrapper or CLIP): the CLIP model.
53
+ image (torch.Tensor): the image tensor.
54
+ text (List[str] or List[List[str]]): the text input.
55
+ h (int): the height of the image.
56
+ w (int): the width of the image.
57
+
58
+ Returns:
59
+ torch.Tensor: the logits.
60
+ """
61
+ if isinstance(text[0], list):
62
+ text_prediction = torch.stack(
63
+ [forward_clip_single(model, image, t, h, w) for t in text], dim=0
64
+ )
65
+ text_prediction = torch.sum(text_prediction, dim=0)
66
+ text_prediction = F.softmax(text_prediction.float(), dim=-1)
67
+ else:
68
+ text_prediction = forward_clip_single(model, image, text, h, w)
69
+ return text_prediction.float()
70
+
71
+
72
+ def upsample_position_embedding(embed, new_size):
73
+ """Upsample the pretrained embedding to a higher resolution.
74
+
75
+ Args:
76
+ embed (torch.Tensor): the pretrained embedding.
77
+ new_size (Tuple[int, int]): the new size of the embedding.
78
+
79
+ Returns:
80
+ torch.Tensor: the upsampled embedding.
81
+ """
82
+ # emb size NxD
83
+ first = embed[:1, :]
84
+ embed = embed[1:, :]
85
+ n = embed.size(0)
86
+ d = embed.size(1)
87
+ size = int(np.sqrt(n))
88
+ if size * size != n:
89
+ raise ValueError(f'The size of embed {n} is not a perfect square number.')
90
+ # new_size = size * self.upsample
91
+ embed = embed.permute(1, 0)
92
+ embed = embed.view(1, d, size, size).contiguous()
93
+ embed = F.interpolate(  # F.upsample is deprecated; interpolate is the drop-in replacement
94
+ embed,
95
+ size=new_size,
96
+ mode='bilinear',
97
+ )
98
+ embed = embed.view(d, -1).contiguous()
99
+ embed = embed.permute(1, 0)
100
+ embed = torch.cat([first, embed], 0)
101
+ embed = nn.parameter.Parameter(embed.half())
102
+ return embed
103
+
104
+
105
+ class CustomBlock(nn.Module):
106
+ """A customized attention block."""
107
+
108
+ def __init__(self, block):
109
+ super().__init__()
110
+ for k, v in vars(block).items():
111
+ setattr(self, k, v)
112
+
113
+ def attention(self, x):
114
+ self.attn_mask = (
115
+ self.attn_mask.to(dtype=x.dtype, device=x.device)
116
+ if self.attn_mask is not None
117
+ else None
118
+ )
119
+ self.attn = self.attn.to(dtype=x.dtype, device=x.device)
120
+ # Setting need_weights to True also returns the attention weights
121
+ return self.attn(x, x, x, need_weights=True, attn_mask=self.attn_mask)
122
+
123
+ def forward(self, x):
124
+ # attn_output: (L,N,E), attn_weight: (N,L,L)
125
+ attn_output, attn_weight = self.attention(self.ln_1(x))
126
+ x = x + attn_output
127
+ x = x + self.mlp(self.ln_2(x))
128
+ return x, attn_weight
129
+
130
+
131
+ class CustomTransformer(nn.Module):
132
+ """A customized Transformer to support CAM calculation."""
133
+
134
+ def __init__(self, transformer):
135
+ """Initialize the wrapper.
136
+
137
+ Args:
138
+ transformer (nn.Module): the Transformer to be wrapped.
139
+ """
140
+ super().__init__()
141
+ for k, v in vars(transformer).items():
142
+ setattr(self, k, v)
143
+
144
+ self.resblocks = nn.Sequential(
145
+ *[CustomBlock(block) for block in self.resblocks]
146
+ )
147
+
148
+ def forward(self, x):
149
+ attn_weights = []
150
+ with torch.no_grad():
151
+ layers = self.layers if x.shape[0] == _CONTEXT_LENGTH else self.layers - 1
152
+ for i in range(layers):
153
+ x, attn_weight = self.resblocks[i](x)
154
+ attn_weights.append(attn_weight)
155
+ return x, attn_weights
156
+
157
+
158
+ class CustomVisionTransformer(nn.Module):
159
+ """A customized VisionTransformer to support CAM calculation."""
160
+
161
+ def __init__(self, model):
162
+ """Initialize the wrapper.
163
+
164
+ Args:
165
+ model (VisionTransformer): the VisionTransformer to be wrapped.
166
+ """
167
+ super().__init__()
168
+ for k, v in vars(model).items():
169
+ setattr(self, k, v)
170
+ self.patch_size = self.conv1.kernel_size[0]
171
+ self.transformer = CustomTransformer(self.transformer)
172
+
173
+ def forward(self, x, h, w):
174
+ self.positional_embedding_new = upsample_position_embedding(
175
+ self.positional_embedding, (h // self.patch_size, w // self.patch_size)
176
+ )
177
+ # shape = [*, width, grid, grid]
178
+ x = self.conv1(x)
179
+ # shape = [*, width, grid ** 2]
180
+ x = x.reshape(x.shape[0], x.shape[1], -1)
181
+ # shape = [*, grid ** 2, width]
182
+ x = x.permute(0, 2, 1)
183
+ zeros = torch.zeros(
184
+ x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device
185
+ )
186
+ # shape = [*, grid ** 2 + 1, width]
187
+ x = torch.cat([self.class_embedding.to(x.dtype) + zeros, x], dim=1)
188
+ x = x + self.positional_embedding_new.to(x.dtype)
189
+ x = self.ln_pre(x)
190
+ # NLD -> LND
191
+ x = x.permute(1, 0, 2)
192
+ x, attn_weight = self.transformer(x)
193
+ return x, attn_weight
194
+
195
+
196
+ class CLIPWrapper(nn.Module):
197
+ """A wrapper for CLIP to support forward with a list of text inputs."""
198
+
199
+ def __init__(self, clip_model):
200
+ """Initialize the wrapper.
201
+
202
+ Args:
203
+ clip_model (CLIP): the CLIP model to be wrapped.
204
+ """
205
+ super().__init__()
206
+ # copy all attributes from clip_model to self
207
+ for k, v in vars(clip_model).items():
208
+ setattr(self, k, v)
209
+ self.visual = CustomVisionTransformer(self.visual)
210
+ self.transformer = CustomTransformer(self.transformer)
211
+
212
+ @property
213
+ def dtype(self):
214
+ return self.visual.conv1.weight.dtype
215
+
216
+ def encode_image(self, image, h, w):
217
+ return self.visual(image.type(self.dtype), h, w)
218
+
219
+ def encode_text(self, text):
220
+ x = self.token_embedding(text).type(
221
+ self.dtype
222
+ ) # [batch_size, n_ctx, d_model]
223
+
224
+ x = x + self.positional_embedding.type(self.dtype)
225
+ x = x.permute(1, 0, 2) # NLD -> LND
226
+ x, _ = self.transformer(x)
227
+ x = x.permute(1, 0, 2) # LND -> NLD
228
+ x = self.ln_final(x).type(self.dtype)
229
+
230
+ # x.shape = [batch_size, n_ctx, transformer.width]
231
+ # take features from the eot embedding
232
+ # (eot_token is the highest number in each sequence)
233
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
234
+
235
+ return x
236
+
237
+ def pool_visual(self, x, use_cls_token=False):
238
+ if use_cls_token:
239
+ return x[:, 0]
240
+ else:
241
+ return torch.mean(x[:, 1:, :], dim=1)
242
+
243
+ def forward_last_layer(
244
+ self, image_features, text_features, use_cls_token=False, repeat_last=True
245
+ ):
246
+ """Forward the last layer of CLIP.
247
+
248
+ Args:
249
+ image_features (torch.Tensor): the image features.
250
+ text_features (torch.Tensor): the text features.
251
+ use_cls_token (bool, optional): whether to use the CLS token. Defaults
252
+ to False.
253
+ repeat_last (bool, optional): whether to repeat the last layer. Defaults
254
+ to True.
255
+
256
+ Returns:
257
+ torch.Tensor: the logits.
258
+ torch.Tensor: the attention weights.
259
+ """
260
+ if repeat_last:
261
+ x, attention_weight = self.visual.transformer.resblocks[
262
+ self.visual.transformer.layers - 1
263
+ ](image_features)
264
+ else:
265
+ x = image_features
266
+ attention_weight = None
267
+ x = x.permute(1, 0, 2) # LND -> NLD
268
+
269
+ x = self.visual.ln_post(x)
270
+ x = self.pool_visual(x, use_cls_token=use_cls_token)
271
+
272
+ if self.visual.proj is not None:
273
+ x = x @ self.visual.proj
274
+
275
+ image_features = x
276
+
277
+ # normalized features
278
+ image_features = image_features / image_features.norm(dim=1, keepdim=True)
279
+ text_features = text_features / text_features.norm(dim=1, keepdim=True)
280
+ # cosine similarity as logits
281
+ logit_scale = self.logit_scale.exp()
282
+ logits_per_image = logit_scale * image_features @ text_features.t()
283
+
284
+ # shape = [global_batch_size, global_batch_size]
285
+ logits_per_image = F.softmax(logits_per_image.float(), dim=-1)
286
+
287
+ return logits_per_image, attention_weight
288
+
289
+ def forward(self, image, text, h=224, w=224):
290
+ with torch.no_grad():
291
+ text_features = self.encode_text(text)
292
+ feature_map, _ = self.visual(image.type(self.dtype), h, w)
293
+
294
+ logits_per_image, _ = self.forward_last_layer(
295
+ feature_map, text_features, use_cls_token=True, repeat_last=False
296
+ )
297
+ return logits_per_image
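A minimal sketch of scoring an image against a set of class prompts with `CLIPWrapper` and `forward_clip` above; the CLIP weights name and image path are assumptions, and the two prompt templates are only examples.

```
import clip
import torch
from PIL import Image

from modeling.model.clip_wrapper import CLIPWrapper, forward_clip

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/16", device=device)  # assumed backbone
model = CLIPWrapper(clip_model)

image = preprocess(Image.open("example.jpg")).unsqueeze(0).to(device)  # placeholder image
h, w = image.shape[-2:]
class_names = ["cat", "dog"]
# A list of template lists triggers the prompt-ensembling branch of forward_clip.
prompts = [[t.format(c) for c in class_names]
           for t in ("a photo of a {}.", "a clean origami {}.")]
probs = forward_clip(model, image, prompts, h, w)
print(probs)  # softmax scores over the class names
```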
modeling/model/clipcam.py ADDED
@@ -0,0 +1,255 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Calculate CAM with CLIP model."""
17
+
18
+ import warnings
19
+
20
+ import clip
21
+ import cv2
22
+ import numpy as np
23
+ import torch
24
+
25
+ # pylint: disable=g-importing-member
26
+ # pylint: disable=g-bad-import-order
27
+ from modeling.model.cam import CAM
28
+ from modeling.model.cam import scale_cam_image
29
+ from modeling.model.utils import img_ms_and_flip
30
+ from modeling.model.utils import reshape_transform
31
+ from modeling.model.utils import scoremap2bbox
32
+
33
+ warnings.filterwarnings("ignore")
34
+
35
+
36
+ class ClipOutputTarget:
37
+
38
+ def __init__(self, category):
39
+ self.category = category
40
+
41
+ def __call__(self, model_output):
42
+ if len(model_output.shape) == 1:
43
+ return model_output[self.category]
44
+ return model_output[:, self.category]
45
+
46
+
47
+ def zeroshot_classifier(classnames, templates, model, device):
48
+ """Zeroshot classifier."""
49
+ with torch.no_grad():
50
+ zeroshot_weights = []
51
+ for classname in classnames:
52
+ if templates is None:
53
+ texts = [classname]
54
+ else:
55
+ # format with class
56
+ texts = [template.format(classname) for template in templates]
57
+ texts = clip.tokenize(texts).to(device) # tokenize
58
+ class_embeddings = model.encode_text(texts) # embed with text encoder
59
+ class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
60
+ class_embedding = class_embeddings.mean(dim=0)
61
+ class_embedding /= class_embedding.norm()
62
+ zeroshot_weights.append(class_embedding)
63
+ zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device)
64
+ return zeroshot_weights.t()
65
+
66
+
67
+ class CLIPCAM:
68
+ """Generate CAM with CLIP model."""
69
+
70
+ def __init__(
71
+ self,
72
+ clip_model,
73
+ device,
74
+ text_template=None,
75
+ threshold=0.4,
76
+ bg_cls=None,
77
+ ):
78
+ self.device = device
79
+ self.clip_model = clip_model.to(device)
80
+ self.text_template = text_template
81
+ self.threshold = threshold
82
+ self.stride = self.clip_model.visual.patch_size
83
+
84
+ # if self.dataset_name == 'voc' else BACKGROUND_CATEGORY_COCO
85
+ self.bg_cls = bg_cls
86
+ self.bg_text_features = None
87
+ if self.bg_cls is not None:
88
+ self.bg_text_features = zeroshot_classifier(
89
+ self.bg_cls,
90
+ ("a clean origami {}.",),
91
+ self.clip_model,
92
+ self.device,
93
+ ).to(self.device)
94
+ self.target_layers = [self.clip_model.visual.transformer.resblocks[-1].ln_1]
95
+ self.cam = CAM(
96
+ model=self.clip_model,
97
+ target_layers=self.target_layers,
98
+ reshape_transform=reshape_transform,
99
+ use_cuda="cuda" in device,
100
+ stride=self.stride,
101
+ )
102
+
103
+ def set_bg_cls(self, bg_cls):
104
+ # if len(bg_cls) == 0:
105
+ if not bg_cls:
106
+ self.bg_cls = None
107
+ self.bg_text_features = None
108
+ else:
109
+ self.bg_cls = bg_cls
110
+ self.bg_text_features = zeroshot_classifier(
111
+ self.bg_cls,
112
+ ("a clean origami {}.",),
113
+ self.clip_model,
114
+ self.device,
115
+ ).to(self.device)
116
+
117
+ def __call__(self, ori_img, text, scale=1.0):
118
+ """Get CAM masks and features.
119
+
120
+ Args:
121
+ ori_img(Image): image to be searched.
122
+ text (str): text to be searched.
123
+ scale (float): image scale.
124
+ Returns:
125
+ CAM masks and features.
126
+ """
127
+ ori_width = ori_img.size[0]
128
+ ori_height = ori_img.size[1]
129
+ if isinstance(text, str):
130
+ text = [text]
131
+
132
+ # convert image to bgr channel
133
+ ms_imgs = img_ms_and_flip(ori_img, ori_height, ori_width, scales=[scale])
134
+ image = ms_imgs[0]
135
+
136
+ image = image.unsqueeze(0)
137
+ h, w = image.shape[-2], image.shape[-1]
138
+ image = image.to(self.device)
139
+ image_features, attn_weight_list = self.clip_model.encode_image(image, h, w)
140
+
141
+ highres_cam_to_save = []
142
+ refined_cam_to_save = []
143
+ # keys = []
144
+
145
+ # [bg_id_for_each_image[im_idx]].to(device_id)
146
+ bg_features_temp = None
147
+ if self.bg_text_features is not None:
148
+ bg_features_temp = self.bg_text_features.to(self.device)
149
+ fg_features_temp = zeroshot_classifier(
150
+ text, self.text_template, self.clip_model, self.device
151
+ ).to(self.device)
152
+ if bg_features_temp is None:
153
+ text_features_temp = fg_features_temp
154
+ else:
155
+ text_features_temp = torch.cat(
156
+ [fg_features_temp, bg_features_temp], dim=0
157
+ )
158
+ input_tensor = [
159
+ image_features,
160
+ text_features_temp.to(self.device),
161
+ h,
162
+ w,
163
+ ]
164
+
165
+ # for idx, label in enumerate(label_list):
166
+ # keys.append(new_class_names.index(label))
167
+ for idx, _ in enumerate(text):
168
+ targets = [ClipOutputTarget(idx)]
169
+
170
+ # torch.cuda.empty_cache()
171
+ grayscale_cam, _, attn_weight_last = self.cam(
172
+ input_tensor=input_tensor, targets=targets, target_size=None
173
+ ) # (ori_width, ori_height))
174
+
175
+ grayscale_cam = grayscale_cam[0, :]
176
+ if grayscale_cam.max() == 0:
177
+ input_tensor_fg = (
178
+ image_features,
179
+ fg_features_temp.to(self.device),
180
+ h,
181
+ w,
182
+ )
183
+ grayscale_cam, _, attn_weight_last = self.cam(
184
+ input_tensor=input_tensor_fg,
185
+ targets=targets,
186
+ target_size=None,
187
+ )
188
+ grayscale_cam = grayscale_cam[0, :]
189
+
190
+ grayscale_cam_highres = cv2.resize(grayscale_cam, (ori_width, ori_height))
191
+ highres_cam_to_save.append(torch.tensor(grayscale_cam_highres))
192
+
193
+ if idx == 0:
194
+ attn_weight_list.append(attn_weight_last)
195
+ attn_weight = [
196
+ aw[:, 1:, 1:] for aw in attn_weight_list
197
+ ] # (b, hxw, hxw)
198
+ attn_weight = torch.stack(attn_weight, dim=0)[-8:]
199
+ attn_weight = torch.mean(attn_weight, dim=0)
200
+ attn_weight = attn_weight[0].cpu().detach()
201
+ attn_weight = attn_weight.float()
202
+
203
+ box, cnt = scoremap2bbox(
204
+ scoremap=grayscale_cam,
205
+ threshold=self.threshold,
206
+ multi_contour_eval=True,
207
+ )
208
+ aff_mask = torch.zeros((grayscale_cam.shape[0], grayscale_cam.shape[1]))
209
+ for i_ in range(cnt):
210
+ x0_, y0_, x1_, y1_ = box[i_]
211
+ aff_mask[y0_:y1_, x0_:x1_] = 1
212
+
213
+ aff_mask = aff_mask.view(
214
+ 1, grayscale_cam.shape[0] * grayscale_cam.shape[1]
215
+ )
216
+ aff_mat = attn_weight
217
+
218
+ trans_mat = aff_mat / torch.sum(aff_mat, dim=0, keepdim=True)
219
+ trans_mat = trans_mat / torch.sum(trans_mat, dim=1, keepdim=True)
220
+
221
+ for _ in range(2):
222
+ trans_mat = trans_mat / torch.sum(trans_mat, dim=0, keepdim=True)
223
+ trans_mat = trans_mat / torch.sum(trans_mat, dim=1, keepdim=True)
224
+ trans_mat = (trans_mat + trans_mat.transpose(1, 0)) / 2
225
+
226
+ # This is copied from CLIP-ES
227
+ for _ in range(1):
228
+ trans_mat = torch.matmul(trans_mat, trans_mat)
229
+
230
+ trans_mat = trans_mat * aff_mask
231
+
232
+ cam_to_refine = torch.FloatTensor(grayscale_cam)
233
+ cam_to_refine = cam_to_refine.view(-1, 1)
234
+
235
+ # (n,n) * (n,1)->(n,1)
236
+ cam_refined = torch.matmul(trans_mat, cam_to_refine).reshape(
237
+ h // self.stride, w // self.stride
238
+ )
239
+ cam_refined = cam_refined.cpu().numpy().astype(np.float32)
240
+ cam_refined_highres = scale_cam_image(
241
+ [cam_refined], (ori_width, ori_height)
242
+ )[0]
243
+ refined_cam_to_save.append(torch.tensor(cam_refined_highres))
244
+
245
+ # post process the cam map
246
+ # label = process(raw_image, refined_cam, postprocessor)
247
+ # vis_img = vis_mask(np.asarray(raw_image), label, [0, 255, 0])
248
+ # vis_img.save(f'clip_es_crf_{idx}.jpg')
249
+
250
+ # keys = torch.tensor(keys)
251
+ # cam_all_scales.append(torch.stack(cam_to_save,dim=0))
252
+
253
+ cam_masks = torch.stack(refined_cam_to_save, dim=0)
254
+
255
+ return cam_masks.to(self.device), fg_features_temp.to(self.device)
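A minimal sketch of generating CAM-based mask proposals with `CLIPCAM` above, mirroring how `CaR` builds its mask generator; the CLIP weights name and image path are assumptions, and background suppression is disabled for brevity.

```
import clip
import torch
from PIL import Image

from modeling.model.clip_wrapper import CLIPWrapper
from modeling.model.clipcam import CLIPCAM

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, _ = clip.load("ViT-B/16", device=device)  # assumed backbone
cam_generator = CLIPCAM(
    CLIPWrapper(clip_model),
    device=device,
    text_template=("a clean origami {}.",),  # template tuple, as used elsewhere above
    threshold=0.4,
    bg_cls=None,  # no background classes in this sketch
)

image = Image.open("example.jpg").convert("RGB")  # placeholder image
cam_masks, fg_text_features = cam_generator(image, ["cat", "dog"])
print(cam_masks.shape)  # one refined CAM per query, at the original image resolution
```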
modeling/model/crf.py ADDED
@@ -0,0 +1,113 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """DenseCRF."""
17
+
18
+ import numpy as np
19
+ from pydensecrf import densecrf as dcrf
20
+ from pydensecrf import utils
21
+ import torch
22
+ import torch.nn.functional as F
23
+
24
+
25
+ class DenseCRF(object):
26
+ """DenseCRF class."""
27
+
28
+ def __init__(self, iter_max, pos_w, pos_xy_std, bi_w, bi_xy_std, bi_rgb_std):
29
+ self.iter_max = iter_max
30
+ self.pos_w = pos_w
31
+ self.pos_xy_std = pos_xy_std
32
+ self.bi_w = bi_w
33
+ self.bi_xy_std = bi_xy_std
34
+ self.bi_rgb_std = bi_rgb_std
35
+
36
+ def __call__(self, image, probmap):
37
+ c, h, w = probmap.shape
38
+
39
+ u = utils.unary_from_softmax(probmap)
40
+ u = np.ascontiguousarray(u)
41
+
42
+ image = np.ascontiguousarray(image)
43
+
44
+ d = dcrf.DenseCRF2D(w, h, c)
45
+ d.setUnaryEnergy(u)
46
+ d.addPairwiseGaussian(sxy=self.pos_xy_std, compat=self.pos_w)
47
+ d.addPairwiseBilateral(
48
+ sxy=self.bi_xy_std,
49
+ srgb=self.bi_rgb_std,
50
+ rgbim=image,
51
+ compat=self.bi_w,
52
+ )
53
+
54
+ q = d.inference(self.iter_max)
55
+ q = np.array(q).reshape((c, h, w))
56
+
57
+ return q
58
+
59
+
60
+ class PostProcess:
61
+ """Post processing with dense CRF."""
62
+
63
+ def __init__(self, device):
64
+ self.device = device
65
+ self.postprocessor = DenseCRF(
66
+ iter_max=10,
67
+ pos_xy_std=1,
68
+ pos_w=3,
69
+ bi_xy_std=67,
70
+ bi_rgb_std=3,
71
+ bi_w=4,
72
+ )
73
+
74
+ def apply_crf(self, image, cams, bg_factor=1.0):
75
+ """Apply dense CRF."""
76
+ bg_score = np.power(1 - np.max(cams, axis=0, keepdims=True), bg_factor)
77
+ cams = np.concatenate((bg_score, cams), axis=0)
78
+ prob = cams
79
+
80
+ image = image.astype(np.uint8).transpose(1, 2, 0)
81
+ prob = self.postprocessor(image, prob)
82
+
83
+ label = np.argmax(prob, axis=0)
84
+
85
+ label_tensor = torch.from_numpy(label).long()
86
+ refined_mask = F.one_hot(label_tensor).to(device=self.device)
87
+ refined_mask = refined_mask.permute(2, 0, 1)
88
+ refined_mask = refined_mask[1:].float()
89
+ return refined_mask
90
+
91
+ def __call__(self, image, cams, separate=False, bg_factor=1.0):
92
+ mean_bgr = (104.008, 116.669, 122.675)
93
+ # covert Image to numpy array
94
+ image = np.array(image).astype(np.float32)
95
+
96
+ # RGB -> BGR
97
+ image = image[:, :, ::-1]
98
+ # Mean subtraction
99
+ image -= mean_bgr
100
+ # HWC -> CHW
101
+ image = image.transpose(2, 0, 1)
102
+
103
+ if isinstance(cams, torch.Tensor):
104
+ cams = cams.cpu().detach().numpy()
105
+ if separate:
106
+ refined_mask = [
107
+ self.apply_crf(image, cam[None], bg_factor) for cam in cams
108
+ ]
109
+ refined_mask = torch.cat(refined_mask, dim=0)
110
+ else:
111
+ refined_mask = self.apply_crf(image, cams, bg_factor)
112
+
113
+ return refined_mask
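
A minimal sketch of how the `PostProcess` CRF refiner above can be driven; the image path and the random CAM tensor are placeholders for illustration only, and `pydensecrf` must be installed:

```python
# Sketch only: refine a stack of fake CAMs for one image with DenseCRF.
# "example.jpg" and the random CAMs are placeholders, not repo assets.
import torch
from PIL import Image

from modeling.model.crf import PostProcess

post_processor = PostProcess(device="cpu")
image = Image.open("example.jpg").convert("RGB")
width, height = image.size
cams = 0.8 * torch.rand(3, height, width) + 0.1  # fake CAMs kept strictly inside (0, 1)

# Returns a [num_classes, H, W] float tensor of CRF-refined foreground masks.
refined = post_processor(image, cams, separate=False, bg_factor=1.0)
print(refined.shape)
```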
modeling/model/utils.py ADDED
@@ -0,0 +1,245 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """CAM utils."""
17
+
18
+ # pylint: disable=g-importing-member
19
+ import os
20
+
21
+ import cv2
22
+ import numpy as np
23
+ from PIL import Image
24
+ from scipy.ndimage import binary_fill_holes
25
+ import torch
26
+ from torchvision.transforms import Compose
27
+ from torchvision.transforms import Normalize
28
+ from torchvision.transforms import Resize
29
+ from torchvision.transforms import ToTensor
30
+
31
+ # pylint: disable=g-import-not-at-top
32
+ try:
33
+ from torchvision.transforms import InterpolationMode
34
+
35
+ BICUBIC = InterpolationMode.BICUBIC
36
+ except ImportError:
37
+ BICUBIC = Image.BICUBIC
38
+
39
+ _CONTOUR_INDEX = 1 if cv2.__version__.split('.')[0] == '3' else 0
40
+
41
+
42
+ def _convert_image_to_rgb(image):
43
+ return image.convert('RGB')
44
+
45
+
46
+ def _transform_resize(h, w):
47
+ return Compose([
48
+ Resize((h, w), interpolation=BICUBIC),
49
+ _convert_image_to_rgb,
50
+ ToTensor(),
51
+ Normalize(
52
+ (0.48145466, 0.4578275, 0.40821073),
53
+ (0.26862954, 0.26130258, 0.27577711),
54
+ ),
55
+ ])
56
+
57
+
58
+ def img_ms_and_flip(image, ori_height, ori_width, scales=1.0, patch_size=16):
59
+ """Resizes and flips the image."""
60
+ if isinstance(scales, float):
61
+ scales = [scales]
62
+
63
+ all_imgs = []
64
+ for scale in scales:
65
+ preprocess = _transform_resize(
66
+ int(np.ceil(scale * int(ori_height) / patch_size) * patch_size),
67
+ int(np.ceil(scale * int(ori_width) / patch_size) * patch_size),
68
+ )
69
+ image = preprocess(image)
70
+ image_ori = image
71
+ image_flip = torch.flip(image, [-1])
72
+ all_imgs.append(image_ori)
73
+ all_imgs.append(image_flip)
74
+ return all_imgs
75
+
76
+
77
+ def reshape_transform(tensor, height=28, width=28):
78
+ tensor = tensor.permute(1, 0, 2)
79
+ result = tensor[:, 1:, :].reshape(
80
+ tensor.size(0), height, width, tensor.size(2)
81
+ )
82
+
83
+ # Bring the channels to the first dimension, like in CNNs.
84
+ result = result.transpose(2, 3).transpose(1, 2)
85
+ return result
86
+
87
+
88
+ def vis_mask(image, mask, mask_color):
89
+ # switch the height and width of image
90
+ # image = image.transpose(1, 0, 2)
91
+ if mask.shape[0] != image.shape[0] or mask.shape[1] != image.shape[1]:
92
+ mask = cv2.resize(mask, (image.shape[1], image.shape[0]))
93
+ fg = mask > 0.5
94
+ rgb = np.copy(image)
95
+ rgb[fg] = (rgb[fg] * 0.3 + np.array(mask_color) * 0.7).astype(np.uint8)
96
+ return Image.fromarray(rgb)
97
+
98
+
99
+ def scoremap2bbox(scoremap, threshold, multi_contour_eval=False):
100
+ """Get bounding boxes from scoremap."""
101
+ height, width = scoremap.shape
102
+ scoremap_image = np.expand_dims((scoremap * 255).astype(np.uint8), 2)
103
+ while True:
104
+ _, thr_gray_heatmap = cv2.threshold(
105
+ src=scoremap_image,
106
+ thresh=int(threshold * np.max(scoremap_image)),
107
+ maxval=255,
108
+ type=cv2.THRESH_BINARY,
109
+ )
110
+ if thr_gray_heatmap.max() > 0 or threshold <= 0:
111
+ break
112
+ threshold -= 0.1
113
+ contours = cv2.findContours(
114
+ image=thr_gray_heatmap, mode=cv2.RETR_TREE, method=cv2.CHAIN_APPROX_SIMPLE
115
+ )[_CONTOUR_INDEX]
116
+
117
+ # if len(contours) == 0:
118
+ if not contours:
119
+ return np.asarray([[0, 0, 0, 0]]), 1
120
+
121
+ if not multi_contour_eval:
122
+ contours = [max(contours, key=cv2.contourArea)]
123
+
124
+ estimated_boxes = []
125
+ for contour in contours:
126
+ x, y, w, h = cv2.boundingRect(contour)
127
+ x0, y0, x1, y1 = x, y, x + w, y + h
128
+ x1 = min(x1, width - 1)
129
+ y1 = min(y1, height - 1)
130
+ estimated_boxes.append([x0, y0, x1, y1])
131
+
132
+ return np.asarray(estimated_boxes), len(contours)
133
+
134
+
135
+ def mask2chw(arr):
136
+ # Find the row and column indices where the array is 1
137
+ rows, cols = np.where(arr == 1)
138
+ # Calculate center of the mask
139
+ center_y = int(np.mean(rows))
140
+ center_x = int(np.mean(cols))
141
+ # Calculate height and width of the mask
142
+ height = rows.max() - rows.min() + 1
143
+ width = cols.max() - cols.min() + 1
144
+ return (center_y, center_x), height, width
145
+
146
+
147
+ def unpad(image_array, pad=None):
148
+ if pad is not None:
149
+ left, top, width, height = pad
150
+ image_array = image_array[top : top + height, left : left + width, :]
151
+ return image_array
152
+
153
+
154
+ def apply_visual_prompts(
155
+ image_array,
156
+ mask,
157
+ visual_prompt_type=('circle',),
158
+ visualize=False,
159
+ color=(255, 0, 0),
160
+ thickness=1,
161
+ blur_strength=(15, 15),
162
+ ):
163
+ """Applies visual prompts to the image."""
164
+ prompted_image = image_array.copy()
165
+ if 'blur' in visual_prompt_type:
166
+ # blur the part outside the mask
167
+ # Blur the entire image
168
+ blurred = cv2.GaussianBlur(prompted_image.copy(), blur_strength, 0)
169
+ # Get the sharp region using the mask
170
+ sharp_region = cv2.bitwise_and(
171
+ prompted_image.copy(),
172
+ prompted_image.copy(),
173
+ mask=np.clip(mask, 0, 255).astype(np.uint8),
174
+ )
175
+ # Get the blurred region using the inverted mask
176
+ inv_mask = 1 - mask
177
+ blurred_region = (blurred * inv_mask[:, :, None]).astype(np.uint8)
178
+ # Combine the sharp and blurred regions
179
+ prompted_image = cv2.add(sharp_region, blurred_region)
180
+ if 'gray' in visual_prompt_type:
181
+ gray = cv2.cvtColor(prompted_image.copy(), cv2.COLOR_BGR2GRAY)
182
+ # make gray part 3 channel
183
+ gray = np.stack([gray, gray, gray], axis=-1)
184
+ # Get the sharp region using the mask
185
+ color_region = cv2.bitwise_and(
186
+ prompted_image.copy(),
187
+ prompted_image.copy(),
188
+ mask=np.clip(mask, 0, 255).astype(np.uint8),
189
+ )
190
+ # Get the gray region using the inverted mask
191
+ inv_mask = 1 - mask
192
+ gray_region = (gray * inv_mask[:, :, None]).astype(np.uint8)
193
+ # Combine the color and gray regions
194
+ prompted_image = cv2.add(color_region, gray_region)
195
+ if 'black' in visual_prompt_type:
196
+ prompted_image = cv2.bitwise_and(
197
+ prompted_image.copy(),
198
+ prompted_image.copy(),
199
+ mask=np.clip(mask, 0, 255).astype(np.uint8),
200
+ )
201
+ if 'circle' in visual_prompt_type:
202
+ mask_center, mask_height, mask_width = mask2chw(mask)
203
+ center_coordinates = (mask_center[1], mask_center[0])
204
+ axes_length = (mask_width // 2, mask_height // 2)
205
+ prompted_image = cv2.ellipse(
206
+ prompted_image,
207
+ center_coordinates,
208
+ axes_length,
209
+ 0,
210
+ 0,
211
+ 360,
212
+ color,
213
+ thickness,
214
+ )
215
+ if 'rectangle' in visual_prompt_type:
216
+ mask_center, mask_height, mask_width = mask2chw(mask)
217
+ # center_coordinates = (mask_center[1], mask_center[0])
218
+ # axes_length = (mask_width // 2, mask_height // 2)
219
+ start_point = (
220
+ mask_center[1] - mask_width // 2,
221
+ mask_center[0] - mask_height // 2,
222
+ )
223
+ end_point = (
224
+ mask_center[1] + mask_width // 2,
225
+ mask_center[0] + mask_height // 2,
226
+ )
227
+ prompted_image = cv2.rectangle(
228
+ prompted_image, start_point, end_point, color, thickness
229
+ )
230
+ if 'contour' in visual_prompt_type:
231
+ # Find the contours of the mask
232
+ # fill holes for the mask
233
+ mask = binary_fill_holes(mask)
234
+ contours, _ = cv2.findContours(
235
+ mask.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
236
+ )
237
+ # Draw the contours on the image
238
+ prompted_image = cv2.drawContours(
239
+ prompted_image.copy(), contours, -1, color, thickness
240
+ )
241
+
242
+ if visualize:
243
+ cv2.imwrite(os.path.join('masked_img.png'), prompted_image)
244
+ prompted_image = Image.fromarray(prompted_image.astype(np.uint8))
245
+ return prompted_image
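
As a quick illustration of the prompt-drawing helpers above, the sketch below circles a toy square mask on a blank image (all values are made up for the example):

```python
# Sketch: draw a red ellipse visual prompt around a toy mask.
import numpy as np

from modeling.model.utils import apply_visual_prompts

image = np.full((64, 64, 3), 255, dtype=np.uint8)  # plain white image
mask = np.zeros((64, 64), dtype=np.uint8)
mask[20:40, 24:44] = 1                             # square foreground region

prompted = apply_visual_prompts(
    image, mask, visual_prompt_type=("circle",), color=(255, 0, 0), thickness=2
)
prompted.save("prompted.jpg")  # PIL image with the ellipse drawn on it
```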
modeling/model/utils_test.py ADDED
@@ -0,0 +1,129 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """This file contains the unit tests for the utils.py file."""
17
+
18
+ import numpy as np
19
+ from PIL import Image
20
+ import torch
21
+
22
+ # pylint: disable=g-bad-import-order
23
+ from modeling.model import utils
24
+
25
+
26
+ def test_scoremap2bbox():
27
+ """Test the scoremap2bbox function."""
28
+ scoremap = np.zeros((10, 10))
29
+ scoremap[1:5, 1:5] = 1
30
+ scoremap[5:9, 5:9] = 2
31
+ scoremap[5:9, 1:5] = 3
32
+ scoremap[1:5, 5:9] = 4
33
+ bbox, len_bboxes = utils.scoremap2bbox(scoremap, 0.5)
34
+ assert len_bboxes == 1
35
+ assert bbox[0, 0] == 1
36
+ assert bbox[0, 1] == 1
37
+ assert bbox[0, 2] == 9
38
+ assert bbox[0, 3] == 9
39
+
40
+
41
+ def test_mask2chw():
42
+ """Test the mask2chw function."""
43
+ mask = np.zeros((10, 10))
44
+ mask[1:5, 1:5] = 1
45
+ mask[5:9, 5:9] = 2
46
+ mask[5:9, 1:5] = 3
47
+ mask[1:5, 5:9] = 4
48
+ mask = torch.tensor(mask)
49
+ mask_center, mask_height, mask_width = utils.mask2chw(mask)
50
+ assert len(mask_center) == 2
51
+ assert mask_center[0] == 2
52
+ assert mask_center[1] == 2
53
+ assert mask_height == 4
54
+ assert mask_width == 4
55
+
56
+
57
+ def test_unpad():
58
+ """Test the unpad function."""
59
+ image = np.zeros((10, 10, 1))
60
+ image[1:5, 1:5] = 1
61
+ image[5:9, 5:9] = 2
62
+ image[5:9, 1:5] = 3
63
+ image[1:5, 5:9] = 4
64
+ unpad_image = utils.unpad(image, pad=(1, 1, 8, 8))
65
+ assert len(unpad_image[0]) == 8, 'The width of the image is not 8.'
66
+ assert len(unpad_image[1]) == 8, 'The height of the image is not 8.'
67
+ unpad_image = utils.unpad(image, None)
68
+ assert (unpad_image == image).sum() == 100
69
+
70
+
71
+ def test_apply_visual_prompts():
72
+ """Test the apply_visual_prompts function."""
73
+ image = np.ones((5, 5))
74
+ mask = np.array([
75
+ [0, 0, 0, 0, 0],
76
+ [0, 0, 0, 0, 0],
77
+ [0, 0, 1.0, 0, 0],
78
+ [0, 0, 0, 0, 0],
79
+ [0, 0, 0, 0, 0],
80
+ ])
81
+
82
+ target = np.array([
83
+ [1, 1, 255, 1, 1],
84
+ [1, 255, 1, 255, 1],
85
+ [255, 1, 1, 1, 255],
86
+ [1, 255, 1, 255, 1],
87
+ [1, 1, 255, 1, 1],
88
+ ])
89
+ mask[1:5, 1:5] = 1
90
+ prompted_image = utils.apply_visual_prompts(
91
+ image, mask, visual_prompt_type='circle', thickness=1
92
+ )
93
+ prompted_array = np.array(prompted_image)
94
+ assert (prompted_array == target).sum() == 25
95
+
96
+
97
+ def test_reshape_transform():
98
+ """Test the reshape_transform function."""
99
+ image = torch.zeros((101, 10, 32))
100
+ image = utils.reshape_transform(image, height=10, width=10)
101
+ b, c, h, w = image.shape
102
+ assert b == 10
103
+ assert c == 32
104
+ assert h == 10
105
+ assert w == 10
106
+
107
+
108
+ def test_img_ms_and_flip():
109
+ """Test the img_ms_and_flip function."""
110
+ image = np.zeros((120, 150))
111
+ image[1:5, 1:5] = 1
112
+ image[5:9, 5:9] = 2
113
+ image[5:9, 1:5] = 3
114
+ image[1:5, 5:9] = 4
115
+ image = Image.fromarray(image)
116
+ image = utils.img_ms_and_flip(image, 120, 150, scales=[1.2], patch_size=16)
117
+ image = image[0]
118
+ h, w = image.shape[-2:]
119
+ assert h == int(np.ceil(1.2 * 120 / 16) * 16)
120
+ assert w == int(np.ceil(1.2 * 150 / 16) * 16)
121
+
122
+
123
+ if __name__ == '__main__':
124
+ test_scoremap2bbox()
125
+ test_mask2chw()
126
+ test_unpad()
127
+ test_apply_visual_prompts()
128
+ test_reshape_transform()
129
+ test_img_ms_and_flip()
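
The tests above are plain functions with a `__main__` guard, so they can also be invoked by hand from the repository root (assuming the `modeling` package is importable from there):

```python
# Sketch: run a few of the unit tests above directly.
from modeling.model import utils_test

utils_test.test_scoremap2bbox()
utils_test.test_unpad()
utils_test.test_reshape_transform()
print("utils tests passed")
```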
modeling/post_process/object_discovery.py ADDED
@@ -0,0 +1,355 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Find objects."""
17
+
18
+ # pylint: disable=g-importing-member
19
+ import numpy as np
20
+ import scipy
21
+ from scipy import ndimage
22
+ from scipy.linalg import eigh
23
+ from scipy.ndimage import label
24
+ import torch
25
+ import torch.nn.functional as F
26
+
27
+
28
+ def ncut(
29
+ feats,
30
+ dims,
31
+ scales,
32
+ init_image_size,
33
+ tau=0,
34
+ eps=1e-5,
35
+ no_binary_graph=False,
36
+ ):
37
+ """Implementation of NCut Method.
38
+
39
+ Args:
40
+ feats: the pixel/patch features of an image
41
+ dims: dimension of the map from which the features are used
42
+ scales: from image to map scale
43
+ init_image_size: size of the image
44
+ tau: threshold for graph construction
45
+ eps: graph edge weight
46
+ no_binary_graph: ablation study for using similarity score as graph
47
+ edge weight
48
+ Returns:
49
+ pred (box for the principal object), objects (connected-component labels), mask (seed-component mask), seed (seed patch index), None, and the second-smallest eigenvector reshaped to dims.
50
+ """
51
+ feats = feats[0, 1:, :]
52
+ feats = F.normalize(feats, p=2)
53
+ a = feats @ feats.transpose(1, 0)
54
+ a = a.cpu().numpy()
55
+ if no_binary_graph:
56
+ a[a < tau] = eps
57
+ else:
58
+ a = a > tau
59
+ a = np.where(a.astype(float) == 0, eps, a)
60
+ d_i = np.sum(a, axis=1)
61
+ d = np.diag(d_i)
62
+
63
+ # Compute the second and third smallest eigenvectors
64
+ _, eigenvectors = eigh(d - a, d, subset_by_index=[1, 2])
65
+ eigenvec = np.copy(eigenvectors[:, 0])
66
+
67
+ # Using average point to compute bipartition
68
+ second_smallest_vec = eigenvectors[:, 0]
69
+ avg = np.sum(second_smallest_vec) / len(second_smallest_vec)
70
+ bipartition = second_smallest_vec > avg
71
+
72
+ seed = np.argmax(np.abs(second_smallest_vec))
73
+
74
+ if bipartition[seed] != 1:
75
+ eigenvec = eigenvec * -1
76
+ bipartition = np.logical_not(bipartition)
77
+ bipartition = bipartition.reshape(dims).astype(float)
78
+
79
+ # predict BBox
80
+ # We only extract the principal object BBox
81
+ pred, _, objects, cc = detect_box(
82
+ bipartition,
83
+ seed,
84
+ dims,
85
+ scales=scales,
86
+ initial_im_size=init_image_size[1:],
87
+ )
88
+ mask = np.zeros(dims)
89
+ mask[cc[0], cc[1]] = 1
90
+
91
+ return np.asarray(pred), objects, mask, seed, None, eigenvec.reshape(dims)
92
+
93
+
94
+ def grad_obj_discover_on_attn(attn, gradcam, dims, topk=1, threshold=0.6):
95
+ """Get the gradcam and attn map, then find the seed, then use LOST algorithm to find the potential points.
96
+
97
+ Args:
98
+ attn: attention map from ViT averaged across all heads, shape: [1,
99
+ (1+num_patches), (1+num_patches)].
100
+ gradcam: gradcam map from ViT, shape: [1, 1, H, W].
101
+ dims: (width, height) of the ViT patch grid.
102
+ topk: number of seed patches taken from the highest GradCAM scores.
103
+ threshold: fraction of the attention mass kept when thresholding.
104
+ Returns:
105
+ th_attn: a [1, 1, width, height] binary map of the kept attention.
106
+ """
107
+
108
+ w_featmap, h_featmap = dims
109
+ # nh = attn.shape[1]
110
+ attn = attn.squeeze()
111
+
112
+ seeds = torch.argsort(gradcam.flatten(), descending=True)[:topk]
113
+
114
+ # We keep only the output patch attention
115
+ # Get the attentions corresponding to [CLS] token
116
+ patch_attn = attn[1:, 1:]
117
+ topk_attn = patch_attn[seeds]
118
+ nh = topk_attn.shape[0]
119
+ # attentions = attn[0, :, 0, 1:].reshape(nh, -1)
120
+
121
+ # we keep only a certain percentage of the mass
122
+ val, idx = torch.sort(topk_attn)
123
+ val /= torch.sum(val, dim=1, keepdim=True)
124
+ cumval = torch.cumsum(val, dim=1)
125
+ th_attn = cumval > (1 - threshold)
126
+ idx2 = torch.argsort(idx)
127
+ for h in range(nh):
128
+ th_attn[h] = th_attn[h][idx2[h]]
129
+ th_attn = th_attn.reshape(nh, w_featmap, h_featmap).float()
130
+ th_attn = th_attn.sum(0)
131
+ th_attn[th_attn > 1] = 1
132
+ return th_attn[None, None]
133
+
134
+
135
+ def grad_obj_discover(feats, gradcam, dims):
136
+ """Using gradient heatmap to find the seed, then use LOST algorithm to find the potential points.
137
+
138
+ Args:
139
+ feats: the pixel/patch features of an image. Shape: [1, HW, C]
140
+ gradcam: the grad cam map
141
+ dims: dimension of the map from which the features are used
142
+
143
+ Returns:
144
+ mask: a [1, 1, *dims] map of the similarity between the seed patch and all patches.
148
+ """
149
+ # Compute the similarity
150
+ a = (feats @ feats.transpose(1, 2)).squeeze()
151
+
152
+ # Compute the inverse degree centrality measure per patch
153
+ # sorted_patches, scores = patch_scoring(a)
154
+
155
+ # Select the initial seed
156
+ # seed = sorted_patches[0]
157
+ seed = gradcam.argmax()
158
+ mask = a[seed]
159
+ mask = mask.view(1, 1, *dims)
160
+
161
+ return mask
162
+
163
+
164
+ def lost(feats, dims, scales, init_image_size, k_patches=100):
165
+ """Implementation of LOST method.
166
+
167
+ Args:
168
+ feats: the pixel/patch features of an image. Shape: [1, C, H, W]
169
+ dims: dimension of the map from which the features are used
170
+ scales: from image to map scale
171
+ init_image_size: size of the image
172
+ k_patches: number of k patches retrieved that are compared to the seed
173
+ at seed expansion.
174
+ Returns:
175
+ mask: the indices (np.where output) of the connected component that contains the seed patch.
179
+ """
180
+ # Compute the similarity
181
+ feats = feats.flatten(2).transpose(1, 2)
182
+ a = (feats @ feats.transpose(1, 2)).squeeze()
183
+
184
+ # Compute the inverse degree centrality measure per patch
185
+ sorted_patches, _ = patch_scoring(a)
186
+
187
+ # Select the initial seed
188
+ seed = sorted_patches[0]
189
+
190
+ # Seed expansion
191
+ potentials = sorted_patches[:k_patches]
192
+ similars = potentials[a[seed, potentials] > 0.0]
193
+ m = torch.sum(a[similars, :], dim=0)
194
+
195
+ # Box extraction
196
+ _, _, _, mask = detect_box(
197
+ m, seed, dims, scales=scales, initial_im_size=init_image_size[1:]
198
+ )
199
+
200
+ return mask
201
+ # return np.asarray(bbox), A, scores, seed
202
+
203
+
204
+ def patch_scoring(m, threshold=0.0):
205
+ """Patch scoring based on the inverse degree."""
206
+ # Cloning important
207
+ a = m.clone()
208
+
209
+ # Zero diagonal
210
+ a.fill_diagonal_(0)
211
+
212
+ # Make sure symmetric and non nul
213
+ a[a < 0] = 0
214
+ # C = A + A.t()
215
+
216
+ # Sort pixels by inverse degree
217
+ cent = -torch.sum(a > threshold, dim=1).type(torch.float32)
218
+ sel = torch.argsort(cent, descending=True)
219
+
220
+ return sel, cent
221
+
222
+
223
+ def detect_box(
224
+ bipartition,
225
+ seed,
226
+ dims,
227
+ initial_im_size=None,
228
+ scales=None,
229
+ principle_object=True,
230
+ ):
231
+ """Extract a box corresponding to the seed patch."""
232
+
233
+ # Among connected components extract from the affinity matrix, select the one
234
+ # corresponding to the seed patch.
235
+
236
+ # w_featmap, h_featmap = dims
237
+ objects, _ = ndimage.label(bipartition)
238
+ cc = objects[np.unravel_index(seed, dims)]
239
+
240
+ if principle_object:
241
+ mask = np.where(objects == cc)
242
+ # Add +1 because excluded max
243
+ ymin, ymax = min(mask[0]), max(mask[0]) + 1
244
+ xmin, xmax = min(mask[1]), max(mask[1]) + 1
245
+ # Rescale to image size
246
+ r_xmin, r_xmax = scales[1] * xmin, scales[1] * xmax
247
+ r_ymin, r_ymax = scales[0] * ymin, scales[0] * ymax
248
+ pred = [r_xmin, r_ymin, r_xmax, r_ymax]
249
+
250
+ # Check not out of image size (used when padding)
251
+ if initial_im_size:
252
+ pred[2] = min(pred[2], initial_im_size[1])
253
+ pred[3] = min(pred[3], initial_im_size[0])
254
+
255
+ # Coordinate predictions for the feature space
256
+ # Axis different then in image space
257
+ pred_feats = [ymin, xmin, ymax, xmax]
258
+
259
+ return pred, pred_feats, objects, mask
260
+ else:
261
+ raise NotImplementedError
262
+
263
+
264
+ # This function is modified from
265
+ # https://github.com/facebookresearch/dino/blob/main/visualize_attention.py
266
+ # Ref: https://github.com/facebookresearch/dino.
267
+ def dino_seg(attn, dims, patch_size, head=0):
268
+ """Extraction of boxes based on the DINO segmentation method proposed in DINO."""
269
+ w_featmap, h_featmap = dims
270
+ nh = attn.shape[1]
271
+ official_th = 0.6
272
+
273
+ # We keep only the output patch attention
274
+ # Get the attentions corresponding to [CLS] token
275
+ attentions = attn[0, :, 0, 1:].reshape(nh, -1)
276
+
277
+ # we keep only a certain percentage of the mass
278
+ val, idx = torch.sort(attentions)
279
+ val /= torch.sum(val, dim=1, keepdim=True)
280
+ cumval = torch.cumsum(val, dim=1)
281
+ th_attn = cumval > (1 - official_th)
282
+ idx2 = torch.argsort(idx)
283
+ for h in range(nh):
284
+ th_attn[h] = th_attn[h][idx2[h]]
285
+ th_attn = th_attn.reshape(nh, w_featmap, h_featmap).float()
286
+
287
+ # Connected components
288
+ labeled_array, _ = scipy.ndimage.label(th_attn[head].cpu().numpy())
289
+
290
+ # Find the biggest component
291
+ size_components = [
292
+ np.sum(labeled_array == c) for c in range(np.max(labeled_array))
293
+ ]
294
+
295
+ if len(size_components) > 1:
296
+ # Select the biggest component avoiding component 0 corresponding
297
+ # to background
298
+ biggest_component = np.argmax(size_components[1:]) + 1
299
+ else:
300
+ # Cases of a single component
301
+ biggest_component = 0
302
+
303
+ # Mask corresponding to connected component
304
+ mask = np.where(labeled_array == biggest_component)
305
+
306
+ # Add +1 because excluded max
307
+ ymin, ymax = min(mask[0]), max(mask[0]) + 1
308
+ xmin, xmax = min(mask[1]), max(mask[1]) + 1
309
+
310
+ # Rescale to image
311
+ r_xmin, r_xmax = xmin * patch_size, xmax * patch_size
312
+ r_ymin, r_ymax = ymin * patch_size, ymax * patch_size
313
+ pred = [r_xmin, r_ymin, r_xmax, r_ymax]
314
+
315
+ return pred
316
+
317
+
318
+ def get_feats(feat_out, shape):
319
+ # Batch size, Number of heads, Number of tokens
320
+ nb_im, nh, nb_tokens = shape[0:3]
321
+ qkv = (
322
+ feat_out["qkv"]
323
+ .reshape(nb_im, nb_tokens, 3, nh, -1 // nh)
324
+ .permute(2, 0, 3, 1, 4)
325
+ )
326
+ k = qkv[1]
327
+ k = k.transpose(1, 2).reshape(nb_im, nb_tokens, -1)
328
+ return k
329
+
330
+
331
+ def get_instances(masks, return_largest=False):
332
+ return [
333
+ get_instances_single(m[None], return_largest=return_largest)
334
+ for m in masks
335
+ ]
336
+
337
+
338
+ def get_instances_single(mask, return_largest=False):
339
+ """Get the mask of a single instance."""
340
+ labeled_array, _ = label(mask.cpu().numpy())
341
+ instances = np.concatenate(
342
+ [labeled_array == c for c in range(np.max(labeled_array) + 1)], axis=0
343
+ )
344
+ if return_largest:
345
+ size_components = np.sum(instances, axis=(1, 2))
346
+ if len(size_components) > 1:
347
+ # Select the biggest component avoiding component 0 corresponding
348
+ # to background
349
+ biggest_component = np.argmax(size_components[1:]) + 1
350
+ else:
351
+ # Cases of a single component
352
+ biggest_component = 0
353
+ # Mask corresponding to connected component
354
+ return torch.from_numpy(labeled_array == biggest_component).float()
355
+ return torch.from_numpy(instances[1:]).float()
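
`get_instances` above splits each binary mask into connected components, optionally keeping only the largest one. A small sketch with a toy mask containing two blobs:

```python
# Sketch: split a toy mask with two blobs into instance masks.
import torch

from modeling.post_process.object_discovery import get_instances

mask = torch.zeros(1, 16, 16)
mask[0, 2:6, 2:6] = 1      # first blob (16 px)
mask[0, 10:14, 9:15] = 1   # second blob (24 px)

instances = get_instances(mask, return_largest=False)
# One entry per input mask; each entry stacks the connected components.
print(instances[0].shape)  # torch.Size([2, 16, 16])

largest = get_instances(mask, return_largest=True)
print(largest[0].shape)    # torch.Size([1, 16, 16]), keeping only the bigger blob
```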
modeling/post_process/post_process.py ADDED
@@ -0,0 +1,167 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Post processing."""
17
+
18
+ import torch
19
+ import torch.nn.functional as F
20
+
21
+ # pylint: disable=g-bad-import-order
22
+ # pylint: disable=g-importing-member
23
+ from modeling.post_process.object_discovery import get_instances
24
+ from utils.metrics import IoM
25
+
26
+
27
+ # This should be an abstract interface for generating mask proposals for the input image.
28
+ # For now it is hard-wired to SAM.
29
+ def generate_masks_from_sam(
30
+ image_path, save_path, pipeline, img_sam=None, visualize=True
31
+ ):
32
+ """Generate masks from SAM."""
33
+ masks, _, mask_list = pipeline.segment_automask(
34
+ image_path=image_path,
35
+ visualize=visualize,
36
+ save_path=save_path,
37
+ image=img_sam,
38
+ )
39
+ mask_tensor = torch.from_numpy(masks)
40
+ mask_tensor = mask_tensor.float()
41
+ return mask_tensor, mask_list
42
+
43
+
44
+ def match_masks(
45
+ mask_tensor, attn_map, mask_list, iom_thres=0.0, min_pred_threshold=0.2
46
+ ):
47
+ """Match masks with the attention map according to the IoU.
48
+
49
+ Args:
50
+ mask_tensor: A torch.Tensor for the masks with shape [num_masks, height,
51
+ width].
52
+ attn_map: A torch.Tensor for the attention map with shape [1, 1, height,
53
+ width].
54
+ mask_list: A list of masks with shape [num_masks, height, width]
55
+ iom_thres: A float IoM threshold above which a candidate mask is kept.
56
+ min_pred_threshold: The prediction score threshold.
57
+
58
+ Returns:
59
+ A list of matched_masks with shape [num_masks, height, width],
60
+ len(matched_masks) = number of captions
61
+ """
62
+ predictions = attn_map.squeeze(1).detach()
63
+ iom = IoM(predictions, mask_tensor, min_pred_threshold=min_pred_threshold)
64
+ keep_mask = iom > iom_thres
65
+ # mask_tensor = mask_tensor[keep_mask]
66
+ new_list = []
67
+ for mid, m_dict in enumerate(mask_list):
68
+ if keep_mask[mid]:
69
+ new_list.append(m_dict)
70
+ # if not len(new_list):
71
+ if not new_list:
72
+ max_id = torch.argmax(iom)
73
+ new_list.append(mask_list[max_id])
74
+ return new_list
75
+
76
+
77
+ def post_process_mask(attn_masks, pad=None, min_area_ratio=0.15):
78
+ """Post process attention masks."""
79
+ if pad is not None:
80
+ left, top, width, height = pad
81
+ attn_masks = attn_masks[Ellipsis, top : top + height, left : left + width]
82
+ else:
83
+ height = None
84
+ width = None
85
+ mask_area = attn_masks.sum(dim=(1, 2))
86
+ total_area = mask_area.sum()
87
+ keep_mask = mask_area / total_area > min_area_ratio
88
+ if torch.sum(keep_mask) == 0:
89
+ if keep_mask.shape[0] == 0:
90
+ return torch.zeros(
91
+ (1, height, width), device=attn_masks.device, dtype=attn_masks.dtype
92
+ )
93
+ keep_mask[torch.argmax(mask_area)] = True
94
+ attn_masks = attn_masks[keep_mask]
95
+ return attn_masks
96
+
97
+
98
+ def filter_masks(
99
+ attn_masks,
100
+ pad=None,
101
+ mask_threshold=0.3,
102
+ min_area_ratio=0.15,
103
+ return_largest=False,
104
+ device=None,
105
+ return_instances=False,
106
+ ):
107
+ """Filter attention mask below the threshold."""
108
+ attn_masks[attn_masks < mask_threshold] = 0
109
+ # get_instances will be operated on cpu
110
+ ins_masks = get_instances(attn_masks, return_largest=return_largest)
111
+ ins_masks = [post_process_mask(m, pad, min_area_ratio) for m in ins_masks]
112
+ ins_masks = list(filter(lambda x: x is not None, ins_masks))
113
+ ins_masks = [m.to(device) for m in ins_masks]
114
+ if not return_instances:
115
+ return [torch.any(m, dim=0, keepdim=True).to(m.dtype) for m in ins_masks]
116
+ return ins_masks
117
+
118
+
119
+ def post_process(
120
+ input_array,
121
+ attn_masks,
122
+ pad=None,
123
+ mask_threshold=0.3,
124
+ return_largest=False,
125
+ min_area_ratio=0.15,
126
+ return_instances=False,
127
+ ):
128
+ """post process the input tensor with the attention masks.
129
+
130
+ Args:
131
+ input_array: A np.ndarray input array to be post processed with shape
132
+ [width, height, 3, batch_size]
133
+ attn_masks: A torch.Tensor for the attention masks with shape [1,
134
+ num_texts, width, height]
135
+ pad: A list of padding: [pad_left, pad_top, width, height], where
136
+ pad_left, pad_top and width, height are int values.
137
+ mask_threshold: The threshold to binarize the mask.
138
+ return_largest: If true, return the largest connected component.
139
+ min_area_ratio: Keep the mask if its area is larger than this threshold.
140
+ return_instances: Whether to return instances or not.
141
+
142
+ Returns:
143
+ attn_masks: A list of tensors with shape [num_instances, height, width]
144
+ x num_texts, where len(attn_masks) = num_texts.
145
+ NOTE: the number_instances for each text (class) may vary.
146
+ The output is a binary tensor.
147
+ """
148
+ if len(attn_masks.shape) == 3:
149
+ attn_masks = attn_masks[None]
150
+ img_width, img_height = input_array.shape[:2]
151
+ attn_masks = F.interpolate(
152
+ attn_masks, size=(img_height, img_width), mode='bicubic'
153
+ ).squeeze(0)
154
+ device = attn_masks.device
155
+ output_masks = filter_masks(
156
+ attn_masks,
157
+ pad=pad,
158
+ mask_threshold=mask_threshold,
159
+ min_area_ratio=min_area_ratio,
160
+ return_largest=return_largest,
161
+ device=device,
162
+ return_instances=return_instances,
163
+ )
164
+ if pad is not None:
165
+ left, top, width, height = pad
166
+ input_array = input_array[top : top + height, left : left + width]
167
+ return input_array, output_masks
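
`post_process` above resizes the raw attention masks to image resolution, thresholds them, and splits/filters connected components. A sketch with dummy inputs, where only the shapes matter:

```python
# Sketch: run the mask post-processing on dummy data.
import numpy as np
import torch

from modeling.post_process.post_process import post_process

image = np.zeros((224, 224, 3), dtype=np.uint8)  # stand-in for the input image
attn = torch.rand(1, 2, 56, 56)                  # 2 low-resolution attention maps

image_out, masks = post_process(
    image, attn, mask_threshold=0.3, return_largest=True
)
# `masks` is a list with one binary [1, 224, 224] tensor per text query.
print(len(masks), masks[0].shape)
```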
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
1
+ tensorflow>=2.14.0
2
+ numpy>=1.16.4
3
+ torch>=2.0.0
4
+ torchvision>=0.15.1
sam/__init__.py ADDED
@@ -0,0 +1,19 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """SAM(Segment Anything Model)."""
17
+
18
+ from .sam import *
19
+ from .utils import *
sam/sam.py ADDED
@@ -0,0 +1,205 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """A pipeline for segmenting objects using the SAM model."""
17
+
18
+ # Copyright 2024 The Google Research Authors.
19
+ # This file is based on the SAM (Segment Anything) and HQ-SAM.
20
+ #
21
+ # https://github.com/facebookresearch/segment-anything
22
+ # https://github.com/SysCV/sam-hq/tree/main
23
+ #
24
+ # Licensed under the Apache License, Version 2.0 (the "License");
25
+ # you may not use this file except in compliance with the License.
26
+ # You may obtain a copy of the License at
27
+ #
28
+ # http://www.apache.org/licenses/LICENSE-2.0
29
+ #
30
+ # Unless required by applicable law or agreed to in writing, software
31
+ # distributed under the License is distributed on an "AS IS" BASIS,
32
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
33
+ # See the License for the specific language governing permissions and
34
+ # limitations under the License.
35
+
36
+
37
+ # pylint: disable=all
38
+ # pylint: disable=g-importing-member
39
+ import os
40
+ import cv2
41
+ import matplotlib.pyplot as plt
42
+ import numpy as np
43
+ from sam.utils import show_anns
44
+ from sam.utils import show_box
45
+ from sam.utils import show_mask
46
+ from sam.utils import show_points
47
+ from segment_anything import sam_model_registry
48
+ from segment_anything import SamAutomaticMaskGenerator
49
+ from segment_anything import SamPredictor
50
+
51
+
52
+ class SAMPipeline:
53
+
54
+ def __init__(
55
+ self,
56
+ checkpoint,
57
+ model_type,
58
+ device="cuda:0",
59
+ points_per_side=32,
60
+ pred_iou_thresh=0.88,
61
+ stability_score_thresh=0.95,
62
+ box_nms_thresh=0.7,
63
+ ):
64
+ self.checkpoint = checkpoint
65
+ self.model_type = model_type
66
+ self.device = device
67
+ self.sam = sam_model_registry[self.model_type](checkpoint=self.checkpoint)
68
+ self.sam.to(device=self.device)
69
+ self.load_mask_generator(
70
+ points_per_side=points_per_side,
71
+ pred_iou_thresh=pred_iou_thresh,
72
+ stability_score_thresh=stability_score_thresh,
73
+ box_nms_thresh=box_nms_thresh,
74
+ )
75
+
76
+ # Default Prompt Args
77
+ self.click_args = {"k": 5, "order": "max", "how_filter": "median"}
78
+ self.box_args = None
79
+
80
+ def load_sam(self):
81
+ print("Loading SAM")
82
+ sam = sam_model_registry[self.model_type](checkpoint=self.checkpoint)
83
+ sam.to(device=self.device)
84
+ self.predictor = SamPredictor(sam)
85
+ print("Loading Done")
86
+
87
+ def load_mask_generator(
88
+ self,
89
+ points_per_side,
90
+ pred_iou_thresh,
91
+ stability_score_thresh,
92
+ box_nms_thresh,
93
+ ):
94
+ print("Loading SAM")
95
+ self.mask_generator = SamAutomaticMaskGenerator(
96
+ model=self.sam,
97
+ points_per_side=points_per_side,
98
+ pred_iou_thresh=pred_iou_thresh,
99
+ stability_score_thresh=stability_score_thresh,
100
+ box_nms_thresh=box_nms_thresh,
101
+ crop_n_layers=0,
102
+ crop_n_points_downscale_factor=1,
103
+ )
104
+ print("Loading Done")
105
+
106
+ # segment single object
107
+ def segment_image_single(
108
+ self,
109
+ image_path,
110
+ input_point=None,
111
+ input_label=None,
112
+ input_box=None,
113
+ input_mask=None,
114
+ multimask_output=True,
115
+ visualize=False,
116
+ save_path=None,
117
+ fname="",
118
+ image=None,
119
+ ):
120
+ if image is None:
121
+ image = cv2.imread(image_path)
122
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
123
+ self.predictor.set_image(image)
124
+ masks, scores, logits = self.predictor.predict(
125
+ point_coords=input_point,
126
+ point_labels=input_label,
127
+ box=input_box,
128
+ mask_input=None,
129
+ multimask_output=multimask_output,
130
+ )
131
+
132
+ if visualize:
133
+ self.visualize(
134
+ image,
135
+ masks,
136
+ scores,
137
+ save_path,
138
+ input_point=input_point,
139
+ input_label=input_label,
140
+ input_box=input_box,
141
+ input_mask=input_mask,
142
+ fname=fname,
143
+ )
144
+
145
+ return masks, scores, logits
146
+
147
+ def segment_automask(
148
+ self,
149
+ image_path,
150
+ visualize=False,
151
+ save_path=None,
152
+ image=None,
153
+ fname="automask.jpg",
154
+ ):
155
+ if image is None:
156
+ image = cv2.imread(image_path)
157
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
158
+
159
+ mask_list, bbox_list = [], []
160
+ masks = self.mask_generator.generate(image)
161
+ mask_list.extend([mask["segmentation"] for mask in masks])
162
+ bbox_list.extend([mask["bbox"] for mask in masks])
163
+
164
+ if visualize:
165
+ self.visualize_automask(image, masks, save_path, fname=fname)
166
+
167
+ masks_arr, bbox_arr = np.array(mask_list), np.array(bbox_list)
168
+ return masks_arr, bbox_arr, masks
169
+
170
+ def visualize_automask(self, image, masks, save_path, fname="mask.jpg"):
171
+ if not os.path.exists(save_path):
172
+ os.makedirs(save_path)
173
+ plt.figure(figsize=(20, 20))
174
+ plt.imshow(image)
175
+ show_anns(masks)
176
+ plt.axis("off")
177
+ plt.savefig(os.path.join(save_path, fname))
178
+
179
+ def visualize(
180
+ self,
181
+ image,
182
+ masks,
183
+ scores,
184
+ save_path,
185
+ input_point=None,
186
+ input_label=None,
187
+ input_box=None,
188
+ input_mask=None,
189
+ fname="",
190
+ ):
191
+ for i, (mask, score) in enumerate(zip(masks, scores)):
192
+ plt.figure(figsize=(10, 10))
193
+ plt.imshow(image)
194
+ show_mask(mask, plt.gca())
195
+ if input_point is not None:
196
+ show_points(input_point, input_label, plt.gca())
197
+ if input_box is not None:
198
+ show_box(input_box, plt.gca())
199
+ if input_mask is not None:
200
+ show_mask(input_mask[0], plt.gca(), True)
201
+ plt.title(f"Mask {i+1}, Score: {score:.3f}", fontsize=18)
202
+ plt.axis("off")
203
+ plt.savefig(os.path.join(save_path, f"{fname}{i}.jpg"))
204
+
205
+ return input_point, input_label, input_box, input_mask
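
A sketch of driving the `SAMPipeline` automatic mask generator above; the checkpoint path is a placeholder and must point to a locally downloaded SAM weight file:

```python
# Sketch: generate SAM masks for one image (checkpoint path is a placeholder).
from sam.sam import SAMPipeline

pipeline = SAMPipeline(
    checkpoint="sam_vit_h_4b8939.pth",  # assumed local SAM checkpoint
    model_type="vit_h",
    device="cuda:0",
)
masks, boxes, raw = pipeline.segment_automask(
    image_path="example.jpg", visualize=False
)
print(masks.shape)  # [num_masks, H, W] boolean array
```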
sam/utils.py ADDED
@@ -0,0 +1,239 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Copyright 2024 The Google Research Authors.
17
+ # This file is based on the SAM (Segment Anything) and HQ-SAM.
18
+ #
19
+ # https://github.com/facebookresearch/segment-anything
20
+ # https://github.com/SysCV/sam-hq/tree/main
21
+ #
22
+ # Licensed under the Apache License, Version 2.0 (the "License");
23
+ # you may not use this file except in compliance with the License.
24
+ # You may obtain a copy of the License at
25
+ #
26
+ # http://www.apache.org/licenses/LICENSE-2.0
27
+ #
28
+ # Unless required by applicable law or agreed to in writing, software
29
+ # distributed under the License is distributed on an "AS IS" BASIS,
30
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
+ # See the License for the specific language governing permissions and
32
+ # limitations under the License.
33
+
34
+ """SAM Utilities."""
35
+ # pylint: disable=all
36
+ # pylint: disable=g-importing-member
37
+ import json
38
+ import matplotlib.pyplot as plt
39
+ import numpy as np
40
+ from scipy.spatial.distance import cdist
41
+
42
+
43
+ def show_mask(mask, ax, random_color=False):
44
+ if random_color:
45
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
46
+ else:
47
+ color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
48
+ h, w = mask.shape[-2:]
49
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
50
+ ax.imshow(mask_image)
51
+
52
+
53
+ def show_points(coords, labels, ax, marker_size=375):
54
+ pos_points = coords[labels == 1]
55
+ neg_points = coords[labels == 0]
56
+ ax.scatter(
57
+ pos_points[:, 0],
58
+ pos_points[:, 1],
59
+ color='green',
60
+ marker='*',
61
+ s=marker_size,
62
+ edgecolor='white',
63
+ linewidth=1.25,
64
+ )
65
+ ax.scatter(
66
+ neg_points[:, 0],
67
+ neg_points[:, 1],
68
+ color='red',
69
+ marker='*',
70
+ s=marker_size,
71
+ edgecolor='white',
72
+ linewidth=1.25,
73
+ )
74
+
75
+
76
+ def show_box(box, ax):
77
+ x0, y0, x1, y1 = box
78
+ w, h = x1 - x0, y1 - y0
79
+ ax.add_patch(
80
+ plt.Rectangle(
81
+ (x0, y0), w, h, edgecolor='red', facecolor=(0, 0, 0, 0), lw=2
82
+ )
83
+ )
84
+
85
+
86
+ def show_anns(anns):
87
+ if len(anns) == 0:
88
+ return
89
+ for index, dictionary in enumerate(anns):
90
+ dictionary['id'] = index
91
+
92
+ sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
93
+ ax = plt.gca()
94
+ ax.set_autoscale_on(False)
95
+ # polygons = []
96
+ # color = []
97
+ for ann in sorted_anns:
98
+ m = ann['segmentation']
99
+ img = np.ones((m.shape[0], m.shape[1], 3))
100
+ color_mask = np.random.random((1, 3)).tolist()[0]
101
+ for i in range(3):
102
+ img[:, :, i] = color_mask[i]
103
+ ax.imshow(np.dstack((img, m * 0.35)))
104
+
105
+ # Get the centroid of the mask
106
+ mask_y, mask_x = np.nonzero(m)
107
+ centroid_x, centroid_y = np.mean(mask_x), np.mean(mask_y)
108
+
109
+ # Display the mask ID
110
+ mask_id = ann['id']
111
+ ax.text(
112
+ centroid_x,
113
+ centroid_y,
114
+ str(mask_id),
115
+ color='black',
116
+ fontsize=48,
117
+ weight='bold',
118
+ )
119
+
120
+
121
+ # Turn CAM result to SAM prompt
122
+ def aggregate_RGB_channel(activation_mask, how='max'):
123
+ B, C, H, W = activation_mask.shape
124
+ if how == 'max':
125
+ res_activation_mask = np.amax(activation_mask, axis=1, keepdims=True)
126
+ elif how == 'avr':
127
+ res_activation_mask = np.mean(activation_mask, axis=1, keepdims=True)
128
+ res_activation_mask = res_activation_mask.reshape(B, 1, H * W)
129
+
130
+ res_activation_mask = np.squeeze(res_activation_mask, axis=1)
131
+ return res_activation_mask
132
+
133
+
134
+ def find_k_points(arr, k, order='max', how_filter='median'):
135
+ arr = arr.squeeze(0)
136
+ flat_indices = np.argpartition(arr.flatten(), -k)[-k:]
137
+ unravel_topk_idx = np.unravel_index(flat_indices, arr.shape)
138
+ topk_indices = np.array(unravel_topk_idx).transpose()[:, ::-1]
139
+ # print(topk_indices.shape)
140
+
141
+ if how_filter == 'random':
142
+ random_rows = np.random.choice(
143
+ topk_indices.shape[0], size=int(round(k / 16)), replace=False
144
+ )
145
+ topk_indices = topk_indices[random_rows]
146
+ elif how_filter == 'median':
147
+ distances = cdist(topk_indices, topk_indices)
148
+ distances = np.sum(distances, axis=1)
149
+ median_distance = np.median(distances)
150
+ filtered_idx = [
151
+ i for i in range(len(distances)) if distances[i] < median_distance
152
+ ]
153
+ topk_indices = topk_indices[filtered_idx]
154
+ return topk_indices
155
+
156
+
157
+ def max_sum_submatrix(matrix):
158
+ matrix = np.array(matrix)
159
+ H, W = matrix.shape
160
+ # Preprocess cumulative sums for rows
161
+ matrix[:, 1:] += matrix[:, :-1]
162
+ max_sum = float('-inf')
163
+ max_rect = (0, 0, 0, 0) # (top, left, bottom, right)
164
+
165
+ for left in range(W):
166
+ for right in range(left, W):
167
+ # Apply 1D Kadane's algorithm for the current pair of columns
168
+ column_sum = matrix[:, right] - (matrix[:, left - 1] if left > 0 else 0)
169
+ max_ending_here = max_so_far = column_sum[0]
170
+ start, end = 0, 0
171
+
172
+ for i in range(1, H):
173
+ val = column_sum[i]
174
+ if max_ending_here > 0:
175
+ max_ending_here += val
176
+ else:
177
+ max_ending_here = val
178
+ start = i
179
+
180
+ if max_ending_here > max_so_far:
181
+ max_so_far = max_ending_here
182
+ end = i
183
+
184
+ if max_so_far > max_sum:
185
+ max_sum = max_so_far
186
+ max_rect = (start, left, end, right)
187
+
188
+ return max_sum, max_rect
189
+
190
+
191
+ def CAM2SAMClick(activation_map, k=5, order='max', how_filter='median'):
192
+ # activation_map = aggregate_RGB_channel(activation_map)
193
+ H, W, C = activation_map.shape
194
+ activation_map = activation_map.reshape((1, 1, H, W))
195
+ coords = []
196
+ for nrow in range(activation_map.shape[0]):
197
+ coord = find_k_points(activation_map[nrow], k, order, how_filter)
198
+ coords.append(coord)
199
+ return coords
200
+
201
+
202
+ def CAM2SAMBox(activation_map):
203
+ # print(activation_map.shape)
204
+ # activation_map = aggregate_RGB_channel(activation_map)
205
+ H, W, C = activation_map.shape
206
+ activation_map = activation_map.reshape((1, H, W))
207
+ box_coordinates = []
208
+ for nrow in range(activation_map.shape[0]):
209
+ # print(activation_map[nrow].shape)
210
+ arr = activation_map[nrow]
211
+
212
+ norm_arr = 2 * ((arr - np.min(arr)) / (np.max(arr) - np.min(arr))) - 1
213
+ # print(norm_arr.shape)
214
+ _, box_coordinate = max_sum_submatrix(norm_arr)
215
+ box_coordinates.append(box_coordinate)
216
+ return box_coordinates
217
+
218
+
219
+ # Visualize
220
+ def visualize_attention(arr, filename):
221
+ # Create a figure and axes object
222
+ fig, ax = plt.subplots()
223
+ # Display the array as an image
224
+ im = ax.imshow(arr)
225
+ # Add a colorbar
226
+ ax.figure.colorbar(im, ax=ax)
227
+ # cbar = ax.figure.colorbar(im, ax=ax)
228
+ # Save the figure as a PNG file
229
+ fig.savefig(filename)
230
+
231
+
232
+ # Build config
233
+ def build_sam_config(config_path):
234
+ with open(config_path, 'r') as infile:
235
+ config = json.load(infile)
236
+
237
+ sam_checkpoint = config['model']['sam_checkpoint']
238
+ model_type = config['model']['model_type']
239
+ return sam_checkpoint, model_type
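
`build_sam_config` above expects a small JSON file with the checkpoint path and model type. A sketch of writing and then reading such a config (the field values are placeholders):

```python
# Sketch: create and load a SAM config for build_sam_config (values are placeholders).
import json

from sam.utils import build_sam_config

config = {
    "model": {
        "sam_checkpoint": "checkpoints/sam_vit_h_4b8939.pth",
        "model_type": "vit_h",
    }
}
with open("sam_config.json", "w") as f:
    json.dump(config, f, indent=2)

sam_checkpoint, model_type = build_sam_config("sam_config.json")
print(sam_checkpoint, model_type)
```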
utils/__init__.py ADDED
@@ -0,0 +1,15 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
utils/inference_pipeline.py ADDED
@@ -0,0 +1,83 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """The inference pipeline for the CaR model."""
17
+
18
+ import numpy as np
19
+ from PIL import Image
20
+ import torch
21
+
22
+ # pylint: disable=g-importing-member
23
+ # pylint: disable=g-bad-import-order
24
+ from modeling.post_process.post_process import generate_masks_from_sam
25
+ from modeling.post_process.post_process import match_masks
26
+ from utils.utils import process_sentence
27
+ from utils.metrics import IoU
28
+
29
+ IMAGE_WIDTH = 512
30
+ IMAGE_HEIGHT = 512
31
+
32
+
33
+ def get_sam_masks(
34
+ config, image_path, masks, matching_thresh=0.9, img_sam=None, pipeline=None
35
+ ):
36
+ """Generate SAM masks."""
37
+ print("generating sam masks online")
38
+ mask_tensor, mask_list = generate_masks_from_sam(
39
+ image_path,
40
+ save_path="./",
41
+ pipeline=pipeline,
42
+ img_sam=img_sam,
43
+ visualize=False,
44
+ )
45
+ mask_tensor = mask_tensor.to(masks.device)
46
+ # only run SAM matching on masks that are not all zero
47
+ attn_map, mask_ids = [], []
48
+ for mask_id, mask in enumerate(masks):
49
+ if torch.sum(mask) > 0:
50
+ attn_map.append(mask.unsqueeze(0))
51
+ mask_ids.append(mask_id)
52
+ matched_masks = [
53
+ match_masks(
54
+ mask_tensor,
55
+ attn,
56
+ mask_list,
57
+ iom_thres=config.car.iom_thres,
58
+ min_pred_threshold=config.sam.min_pred_threshold,
59
+ )
60
+ for attn in attn_map
61
+ ]
62
+ for matched_mask, mask_id in zip(matched_masks, mask_ids):
63
+ sam_masks = np.array([item["segmentation"] for item in matched_mask])
64
+ sam_mask = np.any(sam_masks, axis=0)
65
+ cur_mask = masks[mask_id]
66
+ iou = IoU(torch.from_numpy(sam_mask).to(cur_mask.device), cur_mask)
67
+ if iou > matching_thresh:
68
+ masks[mask_id] = torch.from_numpy(sam_mask).to(masks.device)
69
+ return masks
70
+
71
+
72
+ def inference_car(cfg, car_model, image_path, sentences, sam_pipeline=None):
73
+ sentences = [process_sentence(sen, cfg.test.ds_name) for sen in sentences]
74
+ img = Image.open(image_path).convert("RGB")
75
+ if cfg.test.use_pseudo:
76
+ masks, scores = car_model(img, sentences)
77
+ return masks, scores
78
+
79
+ masks, scores = car_model(img, sentences, cfg.car.num_iteration)
80
+ sam_masks = get_sam_masks(
81
+ cfg, image_path, masks, cfg.sam.matching_thresh, pipeline=sam_pipeline
82
+ )
83
+ return sam_masks, scores
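
`inference_car` above ties the pieces together: CaR produces one mask per text query, and `get_sam_masks` swaps a mask for the union of matching SAM proposals when the IoU is high enough. A thin wrapper as a usage sketch; the `cfg`, `car_model`, and `sam_pipeline` objects are assumed to be built elsewhere (see `app.py` and the `configs/` directory):

```python
# Sketch: how inference_car is meant to be called. The cfg, car_model and
# sam_pipeline arguments are assumed to be constructed elsewhere (see app.py);
# this helper only illustrates the call signature.
from utils.inference_pipeline import inference_car


def segment_with_texts(cfg, car_model, sam_pipeline, image_path, texts):
    """Run CaR on one image for the given text queries, then refine with SAM."""
    masks, scores = inference_car(cfg, car_model, image_path, texts, sam_pipeline)
    return masks, scores
```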
utils/merge_mask.py ADDED
@@ -0,0 +1,57 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Mask merging functions for post-processing."""
17
+
18
+ import cv2
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn.functional as F
22
+
23
+
24
+ def merge_masks_simple(
25
+ all_masks, target_h, target_w, threshold=0.5, scores=None
26
+ ):
27
+ """Merge masks."""
28
+ merged_mask = None
29
+ if scores is not None:
30
+ merged_mask = torch.sum(all_masks * scores[:, None, None], dim=0)
31
+ merged_mask /= torch.sum(scores)
32
+ merged_mask = merged_mask.detach().cpu().numpy()
33
+ # resize the mask to the target size
34
+ merged_mask = cv2.resize(merged_mask, (target_w, target_h))
35
+ merged_mask = np.where(merged_mask >= threshold, 1, 0).astype(np.uint8)
36
+ if np.sum(merged_mask) <= 0.05 * (target_h * target_w):
37
+ merged_mask = torch.any(all_masks > 0, dim=0)
38
+ merged_mask = merged_mask.detach().cpu().numpy().astype(np.uint8)
39
+ # resize the mask to the target size
40
+ merged_mask = cv2.resize(merged_mask, (target_w, target_h))
41
+ merged_mask = merged_mask > threshold
42
+ merged_mask = torch.from_numpy(merged_mask).float()
43
+ return merged_mask[None]
44
+
45
+
46
+ def merge_masks(all_masks, target_h, target_w, threshold=0.5):
47
+ all_masks = torch.from_numpy(np.stack(all_masks)).float()
48
+ mask_tensor = F.interpolate(
49
+ all_masks[None], size=(target_h, target_w), mode='bilinear'
50
+ ).squeeze(0)
51
+ bg_mask = threshold * torch.ones((1, target_h, target_w))
52
+ merged_mask = torch.cat([bg_mask, mask_tensor], dim=0)
53
+ mask_idx = torch.argmax(merged_mask, dim=0)
54
+ merged_mask = mask_idx > 0
55
+ if merged_mask.sum() <= 0.05 * (target_h * target_w):
56
+ merged_mask = torch.any(mask_tensor, dim=0)
57
+ return merged_mask.float()[None]
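
`merge_masks_simple` above collapses several per-iteration masks into one binary mask at the target resolution, weighting them by their scores. A toy sketch:

```python
# Sketch: score-weighted merging of two low-resolution masks into one 64x64 mask.
import torch

from utils.merge_mask import merge_masks_simple

masks = torch.zeros(2, 32, 32)
masks[0, 4:20, 4:20] = 1.0
masks[1, 8:24, 8:24] = 1.0
scores = torch.tensor([0.9, 0.4])  # confidence of each mask

merged = merge_masks_simple(masks, target_h=64, target_w=64, threshold=0.5, scores=scores)
print(merged.shape)                # torch.Size([1, 64, 64]), float values in {0., 1.}
```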
utils/metrics.py ADDED
@@ -0,0 +1,75 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Metrics for evaluating the performance of the model."""
17
+
18
+ import torch
19
+
20
+
21
+ def IoU(mask1, mask2, threshold=0.5):
22
+ """Calculate Intersection over Union (IoU) between prediction and GT masks.
23
+
24
+ Args:
25
+ mask1: A torch.Tensor denoting the prediction, shape (N, H, W), where N is
26
+ the number of masks.
27
+ mask2: A torch.Tensor denoting the ground truth, shape (N, H, W), where N
28
+ is the number of masks.
29
+ threshold: The threshold to binarize masks.
30
+ Returns:
31
+ IoU of `mask1` and `mask2`.
32
+ """
33
+ if threshold > 0:
34
+ mask1, mask2 = (mask1 > threshold).to(torch.bool), (mask2 > threshold).to(
35
+ torch.bool
36
+ )
37
+ intersection = torch.sum(mask1 * (mask1 == mask2), dim=[-1, -2]).squeeze()
38
+ union = torch.sum(mask1 + mask2, dim=[-1, -2]).squeeze()
39
+ if union.sum() == 0:
40
+ return 0
41
+ return (intersection.to(torch.float) / union).mean().item()
42
+
43
+
44
+ def IoM(pred, target, min_pred_threshold=0.2):
45
+ """Calculate Intersection over the area of gt Mask and pred Mask (IoM).
46
+
47
+ between prediction and each ground truth masks.
48
+ Precaution:
49
+ this function works for prediction and target that are binary masks,
50
+ where 1 represents the mask and 0 represents the background.
51
+ Args:
52
+ pred: A torch.Tensor denoting the prediction, shape (N, H, W), where N is
53
+ the number of masks.
54
+ target: A torch.Tensor denoting the ground truth, shape (N, H, W), where N
55
+ is the number of masks.
56
+ min_pred_threshold: minimum fraction of the predicted mask's area that must
+ overlap a target mask; smaller overlaps are treated as background.
57
+
58
+ Returns:
59
+ iom: A torch.Tensor denoting the IoM for each target mask, shape (N,).
60
+ """
61
+ # calculate the intersection over all masks
62
+ intersection = torch.einsum("mij,nij->mn", pred.to(target.device), target)
63
+ area_pred = torch.einsum("mij->m", pred)
64
+ area_target = torch.einsum("nij->n", target)
65
+ # we calculate the IoM by dividing the intersection over the minimum area.
66
+ iom_target = torch.einsum("mn,n->mn", intersection, 1 / area_target)
67
+ iom_pred = torch.einsum("mn,m->mn", intersection, 1 / area_pred)
68
+ # if the intersection is smaller than a certain percentage of the area of
69
+ # the pred mask, we consider it as background.
70
+ iom_target[iom_pred < min_pred_threshold] = 0
71
+ # we consider the IoM as the maximum IoM between the pred mask and
72
+ # the target mask.
73
+ iom = torch.max(iom_target, iom_pred)
74
+ iom = iom.max(dim=0)[0]
75
+ return iom
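As a quick sanity check, the two metrics behave as follows on toy masks. This is a sketch only; the tensors are made up, and the import assumes the repository root is on `PYTHONPATH`.

```python
import torch
from utils.metrics import IoU, IoM

pred = torch.zeros(1, 4, 4)
pred[0, :2, :2] = 1        # prediction covers the top-left 2x2 block
target = torch.zeros(1, 4, 4)
target[0, :2, :] = 1       # ground truth covers the top two rows

print(IoU(pred, target))   # intersection 4 / union 8 -> 0.5
print(IoM(pred, target))   # max(4/8, 4/4) -> tensor([1.])
```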
utils/nlp.py ADDED
@@ -0,0 +1,94 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Language processing utilities."""
17
+
18
+ import spacy
19
+
20
+
21
+ def load_spacy_model(model='en_core_web_trf'):
22
+ nlp = spacy.load(model)
23
+ return nlp
24
+
25
+
26
+ def process_sentence(sentence, nlp):
27
+ """Process a sentence."""
28
+ doc = nlp(sentence)
29
+ sentence_for_spacy = []
30
+
31
+ for _, token in enumerate(doc):
32
+ if token.text == ' ':
33
+ continue
34
+ sentence_for_spacy.append(token.text)
35
+
36
+ sentence_for_spacy = ' '.join(sentence_for_spacy)
37
+ noun_phrase, _, _ = extract_noun_phrase(
38
+ sentence_for_spacy, nlp, need_index=True
39
+ )
40
+ return noun_phrase
41
+
42
+
43
+ def extract_noun_phrase(text, nlp, need_index=False):
44
+ """Extract noun phrase from text. nlp is a spacy model.
45
+
46
+ Args:
47
+ text: str, text to be processed.
48
+ nlp: spacy model.
49
+ need_index: bool, whether to return the index of the noun phrase.
50
+
51
+ Returns:
52
+ noun_phrase: str, noun phrase of the text.
53
+ """
54
+ # text = text.lower()
55
+
56
+ doc = nlp(text)
57
+
58
+ chunks = {}
59
+ chunks_index = {}
60
+ for chunk in doc.noun_chunks:
61
+ for i in range(chunk.start, chunk.end):
62
+ chunks[i] = chunk
63
+ chunks_index[i] = (chunk.start, chunk.end)
64
+
65
+ for token in doc:
66
+ if token.head.i == token.i:
67
+ head = token.head
68
+
69
+ if head.i not in chunks:
70
+ children = list(head.children)
71
+ if children and children[0].i in chunks:
72
+ head = children[0]
73
+ else:
74
+ if need_index:
75
+ return text, [], text
76
+ else:
77
+ return text
78
+
79
+ head_noun = head.text
80
+ head_index = chunks_index[head.i]
81
+ head_index = [i for i in range(head_index[0], head_index[1])]
82
+
83
+ sentence_index = [i for i in range(len(doc))]
84
+ not_phrase_index = []
85
+ for i in sentence_index:
86
+ # not_phrase_index.append(i) if i not in head_index else None
87
+ if i not in head_index:
88
+ not_phrase_index.append(i)
89
+
90
+ head = chunks[head.i]
91
+ if need_index:
92
+ return head.text, not_phrase_index, head_noun
93
+ else:
94
+ return head.text
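A brief usage sketch for the spaCy helpers. The example sentence is arbitrary; `en_core_web_sm` is used here only because it is lighter than the default `en_core_web_trf`, and either model must be downloaded separately with `python -m spacy download ...`.

```python
from utils.nlp import load_spacy_model, extract_noun_phrase

nlp = load_spacy_model('en_core_web_sm')
phrase, other_idx, head_noun = extract_noun_phrase(
    'the small dog on the left side of the image', nlp, need_index=True
)
print(phrase)     # e.g. 'the small dog'
print(head_noun)  # e.g. 'dog'
```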
utils/utils.py ADDED
@@ -0,0 +1,277 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Utility functions for the project."""
17
+
18
+ from __future__ import print_function
19
+ # pylint: disable=g-importing-member
20
+ from collections import defaultdict
21
+ from collections import deque
22
+ from copy import deepcopy
23
+ import datetime
24
+ import errno
25
+ import os
26
+ import sys
27
+ import time
28
+ import numpy as np
29
+ from PIL import Image
30
+ import torch
31
+ from torchvision import transforms
32
+ import yaml
33
+
34
+ # pylint: disable=g-bad-import-order
35
+ from data.voc import CLASS2ID
36
+ from data.voc import VOC_CLASSES
37
+
38
+
39
+ _MB = 1024.0 * 1024.0
40
+
41
+ DINO_transform = transforms.Compose([
42
+ transforms.ToTensor(),
43
+ transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
44
+ ])
45
+
46
+
47
+ class Config:
48
+
49
+ def __init__(self, **kwargs):
50
+ for key, value in kwargs.items():
51
+ if isinstance(value, dict):
52
+ setattr(self, key, Config(**value))
53
+ else:
54
+ setattr(self, key, value)
55
+
56
+
57
+ def load_yaml(filename):
58
+ with open(filename) as file:
59
+ try:
60
+ data = yaml.safe_load(file)
61
+ return data
62
+ except yaml.YAMLError as e:
63
+ print(f"Error while loading YAML file: {e}")
64
+
65
+
66
+ def normalize(x, dim=None, eps=1e-15):
67
+ if dim is None:
68
+ return (x - x.min()) / (x.max() - x.min())
69
+ # Normalize to [0, 1].
70
+ numerator = x - x.min(axis=dim, keepdims=True)[0]
71
+ denominator = (
72
+ x.max(axis=dim, keepdims=True)[0]
73
+ - x.min(axis=dim, keepdims=True)[0]
74
+ + eps
75
+ )
76
+ return numerator / denominator
77
+
78
+
79
+ class SmoothedValue(object):
80
+ """Track a series of values and provide access to smoothed values over a window or the global series average."""
81
+
82
+ def __init__(self, window_size=20, fmt=None):
83
+ if fmt is None:
84
+ fmt = "{median:.4f} ({global_avg:.4f})"
85
+ self.deque = deque(maxlen=window_size)
86
+ self.total = 0.0
87
+ self.count = 0
88
+ self.fmt = fmt
89
+
90
+ def update(self, value, n=1):
91
+ self.deque.append(value)
92
+ self.count += n
93
+ self.total += value * n
94
+
95
+ # def synchronize_between_processes(self):
96
+ # """
97
+ # Warning: does not synchronize the deque!
98
+ # """
99
+ # if not is_dist_avail_and_initialized():
100
+ # return
101
+ # t = torch.tensor([self.count, self.total],
102
+ # dtype=torch.float64, device='cuda')
103
+ # dist.barrier()
104
+ # dist.all_reduce(t)
105
+ # t = t.tolist()
106
+ # self.count = int(t[0])
107
+ # self.total = t[1]
108
+
109
+ @property
110
+ def median(self):
111
+ d = torch.tensor(list(self.deque))
112
+ return d.median().item()
113
+
114
+ @property
115
+ def avg(self):
116
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
117
+ return d.mean().item()
118
+
119
+ @property
120
+ def global_avg(self):
121
+ return self.total / self.count
122
+
123
+ @property
124
+ def max(self):
125
+ return max(self.deque)
126
+
127
+ @property
128
+ def value(self):
129
+ return self.deque[-1]
130
+
131
+ def __str__(self):
132
+ return self.fmt.format(
133
+ median=self.median,
134
+ avg=self.avg,
135
+ global_avg=self.global_avg,
136
+ max=self.max,
137
+ value=self.value,
138
+ )
139
+
140
+
141
+ class MetricLogger(object):
142
+ """Log the metrics."""
143
+
144
+ def __init__(self, delimiter="\t"):
145
+ self.meters = defaultdict(SmoothedValue)
146
+ self.delimiter = delimiter
147
+
148
+ def update(self, **kwargs):
149
+ for k, v in kwargs.items():
150
+ if isinstance(v, torch.Tensor):
151
+ v = v.item()
152
+ assert isinstance(v, (float, int))
153
+ self.meters[k].update(v)
154
+
155
+ def __getattr__(self, attr):
156
+ if attr in self.meters:
157
+ return self.meters[attr]
158
+ if attr in self.__dict__:
159
+ return self.__dict__[attr]
160
+ raise AttributeError(
161
+ "'{}' object has no attribute '{}'".format(type(self).__name__, attr)
162
+ )
163
+
164
+ def __str__(self):
165
+ loss_str = []
166
+ for name, meter in self.meters.items():
167
+ loss_str.append("{}: {}".format(name, str(meter)))
168
+ return self.delimiter.join(loss_str)
169
+
170
+ def synchronize_between_processes(self):
171
+ for meter in self.meters.values():
172
+ meter.synchronize_between_processes()
173
+
174
+ def add_meter(self, name, meter):
175
+ self.meters[name] = meter
176
+
177
+ def log_every(self, iterable, print_freq, header=None):
178
+ """Log every `print_freq` times."""
179
+ i = 0
180
+ if not header:
181
+ header = ""
182
+ start_time = time.time()
183
+ end = time.time()
184
+ iter_time = SmoothedValue(fmt="{avg:.4f}")
185
+ data_time = SmoothedValue(fmt="{avg:.4f}")
186
+ space_fmt = ":" + str(len(str(len(iterable)))) + "d"
187
+ log_msg = self.delimiter.join([
188
+ header,
189
+ "[{0" + space_fmt + "}/{1}]",
190
+ "eta: {eta}",
191
+ "{meters}",
192
+ "time: {time}",
193
+ "data: {data}",
194
+ "max mem: {memory:.0f}",
195
+ ])
196
+ for obj in iterable:
197
+ data_time.update(time.time() - end)
198
+ yield obj
199
+ iter_time.update(time.time() - end)
200
+ if i % print_freq == 0:
201
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
202
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
203
+ print(
204
+ log_msg.format(
205
+ i,
206
+ len(iterable),
207
+ eta=eta_string,
208
+ meters=str(self),
209
+ time=str(iter_time),
210
+ data=str(data_time),
211
+ memory=torch.cuda.max_memory_allocated() / _MB,
212
+ )
213
+ )
214
+ sys.stdout.flush()
215
+
216
+ i += 1
217
+ end = time.time()
218
+ total_time = time.time() - start_time
219
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
220
+ print("{} Total time: {}".format(header, total_time_str))
221
+
222
+
223
+ def mkdir(path):
224
+ try:
225
+ os.makedirs(path)
226
+ except OSError as e:
227
+ if e.errno != errno.EEXIST:
228
+ raise
229
+
230
+
231
+ def pad_to_square(im):
232
+ """Pad the images to square shape."""
233
+ im = deepcopy(im)
234
+ width, height = im.size
235
+ top_pad = (max(width, height) - height) // 2
236
+ bot_pad = max(width, height) - height - top_pad
237
+ left_pad = (max(width, height) - width) // 2
238
+ right_pad = max(width, height) - width - left_pad
239
+
240
+ if len(im.mode) == 3:
241
+ color = (0, 0, 0)
242
+ elif len(im.mode) == 1:
243
+ color = 0
244
+ else:
245
+ raise ValueError(f"Image mode not supported. Image has {im.mode} channels.")
246
+
247
+ return add_margin(im, top_pad, right_pad, bot_pad, left_pad, color=color)
248
+
249
+
250
+ def add_margin(pil_img, top, right, bottom, left, color=(0, 0, 0)):
251
+ """Ref: https://note.nkmk.me/en/python-pillow-add-margin-expand-canvas/."""
252
+ width, height = pil_img.size
253
+ new_width = width + right + left
254
+ new_height = height + top + bottom
255
+ result = Image.new(pil_img.mode, (new_width, new_height), color)
256
+ result.paste(pil_img, (left, top))
257
+
258
+ # 1 represents the image, 0 represents the padding
259
+ pad = [left, top, width, height]
260
+ return result, pad
261
+
262
+
263
+ def process_sentence(sentence, ds_name):
264
+ """Dataset specific sentence processing."""
265
+ if "refcoco" in ds_name:
266
+ sentence = sentence[0].lower()
267
+ # get rid of special characters
268
+ sentence = sentence.replace('"', "")
269
+ sentence = sentence.replace("/", "")
270
+ if ds_name == "voc":
271
+ if sentence in list(CLASS2ID.keys()):
272
+ label_id = CLASS2ID[sentence] - 1
273
+ sentence = VOC_CLASSES[label_id]
274
+
275
+ if not isinstance(sentence, str):
276
+ sentence = sentence[0]
277
+ return sentence
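To show how the pieces above fit together, here is a small sketch. The YAML contents and the `model.clip` key are hypothetical, while `configs/demo/pokemon.yaml` is the demo config path used by `app.py`.

```python
from PIL import Image
from utils.utils import Config, load_yaml, pad_to_square

# Any nested dict in the YAML becomes nested attributes on the Config object.
cfg_dict = load_yaml('configs/demo/pokemon.yaml')   # e.g. {'model': {'clip': 'ViT-B/16'}}
cfg = Config(**cfg_dict)
print(cfg.model.clip)                               # hypothetical key, for illustration

# pad_to_square returns the padded image plus [left, top, width, height]
# describing where the original image sits in the square canvas.
img = Image.new('RGB', (640, 480))
square_img, pad = pad_to_square(img)
print(square_img.size, pad)                         # (640, 640) [0, 80, 640, 480]
```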
utils/visualize.py ADDED
@@ -0,0 +1,107 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Visualization functions."""
17
+
18
+ import os
19
+
20
+ import cv2
21
+ import matplotlib.pyplot as plt
22
+ import numpy as np
23
+ from PIL import Image
24
+ import torch
25
+ # pylint: disable=g-importing-member
26
+ from utils.utils import normalize
27
+
28
+ _VIS_HEIGHT = 512
29
+ _VIS_WIDTH = 512
30
+
31
+
32
+ def show_cam_on_image(img, mask):
+ """Overlay a [0, 1] heat map on an RGB image and return a uint8 CAM visualization."""
33
+ if img.shape[1] != mask.shape[1]:
34
+ mask = cv2.resize(mask, (img.shape[1], img.shape[0]))
35
+ heatmap = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
36
+ heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
37
+ heatmap = np.float32(heatmap) / 255
38
+ cam = heatmap + np.float32(img)
39
+ cam = cam / np.max(cam)
40
+ cam = np.uint8(255 * cam)
41
+ return cam
42
+
43
+
44
+ def save_img(array, img_name):
45
+ numpy_array = array.astype(np.uint8)
46
+ image = Image.fromarray(numpy_array, mode="RGB")
47
+ image.save(f"{img_name}.png")
48
+
49
+
50
+ def viz_attn(img, attn_map, prefix="vis_results/clipcam_img", img_name="cam"):
51
+ """Visualize attention map."""
52
+ num_masks = 1
53
+ if len(attn_map.shape) == 3:
54
+ num_masks = attn_map.shape[0]
55
+ attn_map = attn_map.float().squeeze(1).detach().cpu().numpy()
56
+ attn_map = normalize(attn_map)
57
+ img = normalize(img)
58
+ if num_masks == 1:
59
+ vis = show_cam_on_image(img, attn_map)
60
+ if not os.path.exists(prefix):
61
+ os.makedirs(prefix)
62
+ save_img(vis, os.path.join(prefix, f"{img_name}"))
63
+ return vis
64
+ for i in range(num_masks):
65
+ vis = show_cam_on_image(img, attn_map[i])
66
+ if not os.path.exists(prefix):
67
+ os.makedirs(prefix)
68
+ save_img(vis, os.path.join(prefix, f"{img_name}_{i}"))
69
+
70
+
71
+ def vis_mask(mask, gt_mask, img, output_dir, fname):
72
+ """Visualize mask."""
73
+ mask_img = torch.zeros((_VIS_WIDTH, _VIS_HEIGHT))
74
+ mask_img[mask[0]] = 1
75
+
76
+ # print(gt_mask.shape, img.size())
77
+ # Assume img and gt_mask are also torch.Tensor with size (512, 512)
78
+ img = img[0].permute(1, 2, 0).numpy()
79
+ gt_mask_img = torch.zeros((_VIS_WIDTH, _VIS_HEIGHT))
80
+ gt_mask_img[gt_mask[0]] = 1
81
+
82
+ _, axs = plt.subplots(
83
+ 1, 3, figsize=(15, 5)
84
+ ) # change the figsize if necessary
85
+
86
+ axs[0].imshow(img) # if image is grayscale, otherwise remove cmap argument
87
+ axs[0].axis("off")
88
+ axs[0].set_title("Original Image")
89
+
90
+ axs[1].imshow(
91
+ mask_img.numpy(), cmap="jet", alpha=0.5
92
+ ) # using alpha for transparency
93
+ axs[1].axis("off")
94
+ axs[1].set_title("Mask")
95
+
96
+ axs[2].imshow(
97
+ gt_mask_img.numpy(), cmap="jet", alpha=0.5
98
+ ) # using alpha for transparency
99
+ axs[2].axis("off")
100
+ axs[2].set_title("Ground Truth Mask")
101
+
102
+ plt.savefig(
103
+ os.path.join(output_dir, f"{fname}.jpg"),
104
+ bbox_inches="tight",
105
+ dpi=300,
106
+ pad_inches=0.0,
107
+ )
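Closing with a rough usage sketch for the CAM overlay helper. The image and attention maps below are random placeholders, and the output directory name is arbitrary.

```python
import numpy as np
import torch
from utils.visualize import viz_attn

# Placeholder inputs: a 224x224 RGB image in [0, 1] and three coarse attention maps.
img = np.random.rand(224, 224, 3).astype(np.float32)
attn = torch.rand(3, 14, 14)   # (num_masks, h, w); resized to the image size internally

# Saves one heat-map overlay per mask, e.g. vis_results/demo/cam_0.png ... cam_2.png.
viz_attn(img, attn, prefix='vis_results/demo', img_name='cam')
```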