Spaces:

sagar007
/

SegmentVision

Runtime error

App Files Files Community

sagar007 committed on Jul 26, 2024

Commit

26c0f04

verified ·

1 Parent(s): 3cd1243

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -7

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import cv2
 import numpy as np
 from transformers import SamModel, SamProcessor, BlipProcessor, BlipForConditionalGeneration
 from PIL import Image
 # Set up device
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -26,6 +27,26 @@ def process_mask(mask, target_size):
     mask_image = mask_image.resize(target_size, Image.NEAREST)
     return np.array(mask_image) > 0
 def segment_image(input_image, object_name):
     try:
         if input_image is None:
@@ -36,9 +57,9 @@ def segment_image(input_image, object_name):
         if not original_size or 0 in original_size:
             return None, "Invalid image size. Please upload a different image."
-        # Generate image caption
         blip_inputs = blip_processor(input_image, return_tensors="pt").to(device)
-        caption = blip_model.generate(**blip_inputs)
         caption_text = blip_processor.decode(caption[0], skip_special_tokens=True)
         # Process the image with SAM
@@ -58,15 +79,21 @@ def segment_image(input_image, object_name):
         # Find the mask that best matches the specified object
         best_mask = None
         best_score = -1
         for mask in masks[0]:
             mask_binary = mask.numpy() > 0.5
-            mask_area = mask_binary.sum()
-            if object_name.lower() in caption_text.lower() and mask_area > best_score:
-                best_mask = mask_binary
-                best_score = mask_area
         if best_mask is None:
-            return input_image, f"Could not find '{object_name}' in the image."
         combined_mask = process_mask(best_mask, original_size)

 import numpy as np
 from transformers import SamModel, SamProcessor, BlipProcessor, BlipForConditionalGeneration
 from PIL import Image
+from scipy.ndimage import label, center_of_mass
 # Set up device
 device = "cuda" if torch.cuda.is_available() else "cpu"
     mask_image = mask_image.resize(target_size, Image.NEAREST)
     return np.array(mask_image) > 0
def is_cat_like(mask, image_area):
    """Heuristically decide whether a binary mask could outline a cat.

    The largest connected component of ``mask`` must pass two checks:

    1. Its pixel count lies strictly between 5% and 30% of ``image_area``.
    2. Its bounding box is elongated: the ratio of the longest to the
       shortest side lies strictly between 1.5 and 3.

    Args:
        mask: Boolean (or 0/1) array; non-zero entries are foreground.
            Presumably a 2-D ``(height, width)`` mask -- confirm against
            the caller.
        image_area: Total number of pixels used as the reference area.
            The component area is counted in mask pixels, so this value
            must be expressed at the mask's resolution.

    Returns:
        ``True`` if both checks pass, ``False`` otherwise (including when
        the mask has no foreground pixels).
    """
    labeled, num_features = label(mask)
    if num_features == 0:
        return False

    # Label 0 is background, so drop it before choosing the biggest blob.
    component_sizes = np.bincount(labeled.ravel())[1:]
    largest_component = labeled == (component_sizes.argmax() + 1)
    area = largest_component.sum()

    # Check if the area is reasonable for a cat (between 5% and 30% of image)
    if not (0.05 * image_area < area < 0.3 * image_area):
        return False

    # Measure the component's own bounding box.  Using the array's shape
    # here would test the aspect ratio of the whole image, not of the blob.
    extents = [
        int(coords.max() - coords.min()) + 1
        for coords in np.nonzero(largest_component)
    ]
    aspect_ratio = max(extents) / min(extents)
    return 1.5 < aspect_ratio < 3  # Most cats have an aspect ratio in this range
 def segment_image(input_image, object_name):
     try:
         if input_image is None:
         if not original_size or 0 in original_size:
             return None, "Invalid image size. Please upload a different image."
+        # Generate detailed image caption
         blip_inputs = blip_processor(input_image, return_tensors="pt").to(device)
+        caption = blip_model.generate(**blip_inputs, max_length=50)
         caption_text = blip_processor.decode(caption[0], skip_special_tokens=True)
         # Process the image with SAM
         # Find the mask that best matches the specified object
         best_mask = None
         best_score = -1
+        image_area = original_size[0] * original_size[1]
+        cat_related_words = ['cat', 'kitten', 'feline', 'tabby', 'kitty']
+        caption_contains_cat = any(word in caption_text.lower() for word in cat_related_words)
         for mask in masks[0]:
             mask_binary = mask.numpy() > 0.5
+            if is_cat_like(mask_binary, image_area) and caption_contains_cat:
+                mask_area = mask_binary.sum()
+                if mask_area > best_score:
+                    best_mask = mask_binary
+                    best_score = mask_area
         if best_mask is None:
+            return input_image, f"Could not find a suitable '{object_name}' in the image."
         combined_mask = process_mask(best_mask, original_size)