Spaces:

dennistrujillo
/

MedSAMTest

Sleeping

App Files Files Community

dennistrujillo committed on Jan 28, 2024

Commit

201e3ec

verified ·

1 Parent(s): 1c3d5f7

Integrated MedSAM Model with Full Pipeline for Image Segmentation

Browse files

Files changed (1) hide show

app.py +47 -38

app.py CHANGED Viewed

@@ -24,34 +24,35 @@ def load_image(file_path):
     H, W = img.shape[:2]
     return img, H, W
-# MedSAM inference function
-def medsam_inference(medsam_model, img, box, H, W, target_size, device):
-    # Assuming the model expects 1024x1024 input
-    expected_model_input_size = 1024
-    # Resize image to expected model input size
-    img_resized = transform.resize(img, (expected_model_input_size, expected_model_input_size), anti_aliasing=True)
-    # Ensure the image is in the correct shape (H, W, C)
-    if len(img_resized.shape) == 3 and img_resized.shape[2] == 3:
-        # Convert to PyTorch tensor and add batch dimension
-        img_tensor = torch.from_numpy(img_resized.transpose((2, 0, 1))).float().unsqueeze(0).to(device)
-    else:
-        raise ValueError("Image must be a 3-channel (RGB) image")
-    box_resized = np.array(box) * (target_size / np.array([W, H, W, H]))
-    # Model expects box in format (x0, y0, x1, y1)
-    box_tensor = torch.tensor(box_resized, dtype=torch.float32).unsqueeze(0).to(device)  # Add batch dimension
-    # MedSAM inference
-    img_embed = medsam_model.image_encoder(img_tensor)
-    mask = medsam_model.predict(img_embed, box_tensor)
-    # Post-process mask: resize back to original size
-    mask_resized = transform.resize(mask[0].cpu().numpy(), (H, W))
-    return mask_resized
 # Function for visualizing images with masks
 def visualize(image, mask, box):
@@ -67,11 +68,13 @@ def visualize(image, mask, box):
     buf.seek(0)
     return buf
-# Main function for Gradio app
 # Main function for Gradio app
 def process_images(file, x_min, y_min, x_max, y_max):
     image, H, W = load_image(file)
     # Check if CUDA is available, and set the device accordingly
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -80,18 +83,24 @@ def process_images(file, x_min, y_min, x_max, y_max):
     # Create the model instance and load the checkpoint
     medsam_model = sam_model_registry['vit_b'](checkpoint=model_checkpoint_path)
-    # If running on CPU, map the model to CPU
-    if device == 'cpu':
-        medsam_model = medsam_model.to(torch.device('cpu'))
-    medsam_model.eval()
-    box = [x_min, y_min, x_max, y_max]
-    mask = medsam_inference(medsam_model, image, box, H, W, H, device)  # Pass device to the inference function
-    visualization = visualize(image, mask, box)
-    return visualization.getvalue() # Returning the byte stream
 # Set up Gradio interface
 iface = gr.Interface(

     H, W = img.shape[:2]
     return img, H, W
+def medsam_inference(medsam_model, img_embed, box_1024, H, W):
+    box_torch = torch.as_tensor(box_1024, dtype=torch.float, device=img_embed.device)
+    if len(box_torch.shape) == 2:
+        box_torch = box_torch[:, None, :] # (B, 1, 4)
+    sparse_embeddings, dense_embeddings = medsam_model.prompt_encoder(
+        points=None,
+        boxes=box_torch,
+        masks=None,
+    )
+    low_res_logits, _ = medsam_model.mask_decoder(
+        image_embeddings=img_embed, # (B, 256, 64, 64)
+        image_pe=medsam_model.prompt_encoder.get_dense_pe(), # (1, 256, 64, 64)
+        sparse_prompt_embeddings=sparse_embeddings, # (B, 2, 256)
+        dense_prompt_embeddings=dense_embeddings, # (B, 256, 64, 64)
+        multimask_output=False,
+        )
+    low_res_pred = torch.sigmoid(low_res_logits)  # (1, 1, 256, 256)
+    low_res_pred = F.interpolate(
+        low_res_pred,
+        size=(H, W),
+        mode="bilinear",
+        align_corners=False,
+    )  # (1, 1, gt.shape)
+    low_res_pred = low_res_pred.squeeze().cpu().numpy()  # (256, 256)
+    medsam_seg = (low_res_pred > 0.5).astype(np.uint8)
+    return medsam_seg
 # Function for visualizing images with masks
 def visualize(image, mask, box):
     buf.seek(0)
     return buf
 # Main function for Gradio app
 def process_images(file, x_min, y_min, x_max, y_max):
+    # Load and preprocess image
     image, H, W = load_image(file)
+    image_resized = transform.resize(image, (1024, 1024), anti_aliasing=True)
+    image_resized = (image_resized - image_resized.min()) / np.clip(image_resized.max() - image_resized.min(), a_min=1e-8, a_max=None)
     # Check if CUDA is available, and set the device accordingly
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     # Create the model instance and load the checkpoint
     medsam_model = sam_model_registry['vit_b'](checkpoint=model_checkpoint_path)
+    medsam_model = medsam_model.to(device)
+    medsam_model.eval()
+    # Convert image to tensor and move to the correct device
+    image_tensor = torch.tensor(image_resized).float().permute(2, 0, 1).unsqueeze(0).to(device)
+    # Generate image embedding
+    with torch.no_grad():
+        img_embed = medsam_model.image_encoder(image_tensor)
+    # Calculate resized box coordinates and perform inference
+    scale_factors = np.array([1024 / W, 1024 / H, 1024 / W, 1024 / H])
+    box_1024 = np.array([x_min, y_min, x_max, y_max]) * scale_factors
+    mask = medsam_inference(medsam_model, img_embed, box_1024, H, W)
+    visualization = visualize(image, mask, [x_min, y_min, x_max, y_max])
+    return visualization.getvalue()
 # Set up Gradio interface
 iface = gr.Interface(