|
import data |
|
import cv2 |
|
import torch |
|
from PIL import Image, ImageDraw |
|
from tqdm import tqdm |
|
from models import imagebind_model |
|
from models.imagebind_model import ModalityType |
|
|
|
from segment_anything import build_sam, SamAutomaticMaskGenerator |
|
|
|
from utils import ( |
|
segment_image, |
|
convert_box_xywh_to_xyxy, |
|
get_indices_of_values_above_threshold, |
|
) |
|
|
|
|
|
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


"""
Step 1: Instantiate model
"""

# SAM model built from the local checkpoint and moved to the chosen device,
# then wrapped in the automatic mask generator.
sam_model = build_sam(checkpoint=".checkpoints/sam_vit_h_4b8939.pth").to(device)
mask_generator = SamAutomaticMaskGenerator(sam_model, points_per_side=16)

# ImageBind model with pretrained weights, switched to eval mode because it
# is only used for inference in this script.
bind_model = imagebind_model.imagebind_huge(pretrained=True)
bind_model.eval()
bind_model.to(device)
|
|
|
|
|
"""
Step 2: Generate auto masks with SAM
"""
image_path = ".assets/car_image.jpg"

image = cv2.imread(image_path)
# cv2.imread does not raise on a missing/unreadable file; it returns None,
# which would otherwise only fail later inside cvtColor with an opaque error.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# OpenCV loads images as BGR; convert to RGB before handing the array to SAM.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
masks = mask_generator.generate(image)
|
|
|
|
|
"""
Step 3: Get cropped images based on mask and box
"""
# `image` is rebound here to a PIL image opened from the same path; the
# merge step further down reads `image.size` from this object.
image = Image.open(image_path)

# One crop per SAM mask: apply the mask to the image, then crop the result to
# the mask's bounding box (converted from xywh to xyxy for `.crop`).
cropped_boxes = [
    segment_image(image, mask["segmentation"]).crop(
        convert_box_xywh_to_xyxy(mask["bbox"])
    )
    for mask in tqdm(masks)
]
|
|
|
|
|
"""
Step 4: Run ImageBind model to get similarity between cropped image and different modalities
"""


def retriev_vision_and_audio(elements, audio_list):
    """Score image inputs against audio inputs using the ImageBind model.

    Args:
        elements: Images forwarded to
            ``data.load_and_transform_vision_data_from_pil_image``
            (the caller in this script passes the list of cropped PIL images).
        audio_list: Audio inputs forwarded to
            ``data.load_and_transform_audio_data``
            (the caller in this script passes a list of audio file paths).

    Returns:
        A one-element tuple. Its single item is the softmax, taken over
        dim 0, of ``vision_embeddings @ audio_embeddings.T``. The tuple
        wrapper is intentional to keep the existing contract: the caller
        reads the scores with ``result[0]``.
    """
    vision_batch = data.load_and_transform_vision_data_from_pil_image(elements, device)
    audio_batch = data.load_and_transform_audio_data(audio_list, device)
    inputs = {
        ModalityType.VISION: vision_batch,
        ModalityType.AUDIO: audio_batch,
    }

    # Inference only: no autograd bookkeeping needed for the forward pass.
    with torch.no_grad():
        embeddings = bind_model(inputs)

    vision_emb = embeddings[ModalityType.VISION]
    audio_emb = embeddings[ModalityType.AUDIO]
    similarity = torch.softmax(vision_emb @ audio_emb.T, dim=0)
    return (similarity,)
|
|
|
# Similarity of every crop to the reference audio file (returned as a 1-tuple).
vision_audio_result = retriev_vision_and_audio(cropped_boxes, [".assets/car_audio.wav"])


"""
Step 5: Merge the top similarity masks to get the final mask and save the merged mask

This is the audio retrieval result
"""

threshold = 0.025
# Element [0] unwraps the 1-tuple returned by retriev_vision_and_audio.
index = get_indices_of_values_above_threshold(vision_audio_result[0], threshold)

# Turn each selected SAM mask into an 8-bit image (mask values scaled by 255).
segmentation_masks = [
    Image.fromarray(masks[seg_idx]["segmentation"].astype('uint8') * 255)
    for seg_idx in index
]

# NOTE(review): original_image is not referenced again in this part of the
# script; kept as-is in case code elsewhere relies on the name.
original_image = Image.open(image_path)

# Start from an opaque black canvas the same size as the source image and
# paint the fill colour wherever a selected mask is non-zero.
overlay_color = (255, 255, 255, 0)
overlay_image = Image.new('RGBA', image.size, (0, 0, 0, 255))

draw = ImageDraw.Draw(overlay_image)
for segmentation_mask_image in segmentation_masks:
    draw.bitmap((0, 0), segmentation_mask_image, fill=overlay_color)

# Convert from RGBA to RGB before writing the merged mask to disk as a JPEG.
mask_image = overlay_image.convert("RGB")
mask_image.save("./audio_sam_merged_mask.jpg")
|
|
|
|