updated inference, testing batch size
- app.py (+1, -9)
- dino_sam.py (+10, -7)
app.py CHANGED

@@ -7,8 +7,6 @@ os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:5000'
 subprocess.run(['pip', 'install', '-e', 'GroundingDINO'])
 sys.path.append(os.path.join(os.getcwd(), "GroundingDINO"))
 sys.path.append(os.path.join(os.getcwd(), "segment_anything"))
-# os.system("wget https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth")
-# os.system("wget https://huggingface.co/spaces/mrtlive/segment-anything-model/resolve/main/sam_vit_h_4b8939.pth")

 import gradio as gr
 from dino_sam import sam_dino_vid
@@ -43,12 +41,6 @@ with gr.Blocks() as demo:
         """
     )

-    gr.HTML(
-        """
-        <p="left">
-        The csv contains frame numbers and timestamps, bounding box coordinates, and number of detections per frame.</p>
-        """
-    )
     with gr.Row():
         with gr.Column():
             input = gr.Video(label="Input Video", interactive=True)
@@ -74,7 +66,7 @@ with gr.Blocks() as demo:
                                   step=1)
             video_options = gr.CheckboxGroup(choices=["Bounding boxes", "Masks"],
                                              label="Video Output Options",
-                                             info="Select the options to display in the output video.",
+                                             info="Select the options to display in the output video. Note: if masks are selected, runtime will increase.",
                                              value=["Bounding boxes"],
                                              interactive=True)

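The selected checkbox labels are passed straight through to sam_dino_vid as video_options; inside dino_sam.py they gate the masks_needed / boxes_needed branches. A minimal sketch of that mapping, assuming the flags are derived by simple membership tests (only the names video_options, masks_needed, and boxes_needed appear in this diff):

    # Hypothetical derivation of the flags dino_sam.py branches on.
    video_options = ["Bounding boxes", "Masks"]   # value from the CheckboxGroup
    boxes_needed = "Bounding boxes" in video_options
    masks_needed = "Masks" in video_options       # SAM only runs when True,
                                                  # hence the runtime warning in the UI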
dino_sam.py CHANGED

@@ -8,7 +8,7 @@ import torch
 import csv
 # import pstats
 import warnings
-
+from memory_profiler import profile
 # from pstats import SortKey
 from tqdm import tqdm
 from torchvision.ops import box_convert
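memory_profiler is imported so sam_dino_vid can be decorated with @profile for a line-by-line memory report. For reference, a sketch of the standard usage (placeholder function; the decorator and module invocation are the documented memory_profiler API):

    from memory_profiler import profile

    @profile  # prints per-line memory increments when the function runs
    def build_batch():
        frames = [bytearray(10 ** 6) for _ in range(100)]  # ~100 MB dummy workload
        return len(frames)

    if __name__ == '__main__':
        build_batch()

Running the script normally is enough once the decorator is active; python -m memory_profiler script.py works as well.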
@@ -26,6 +26,7 @@ def prepare_image(image, transform, device):
     image = torch.as_tensor(image, device=device.device)
     return image.permute(2, 0, 1).contiguous()

+# @profile
 def sam_dino_vid(
     vid_path: str,
     text_prompt: str,
@@ -36,7 +37,7 @@ def sam_dino_vid(
     config_path: str = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
     weights_path: str = "weights/groundingdino_swint_ogc.pth",
     device: str = 'cuda',
-    batch_size: int =
+    batch_size: int = 10
 ) -> (str, str):
     """ Args:
        Returns:
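batch_size now defaults to 10 frames per pass, matching the commit's stated goal of testing batch sizes. A sketch of the chunking this parameter implies, under the assumption that frame paths are sliced into fixed-size groups (the helper below is hypothetical; the diff only shows batch_paths being consumed downstream):

    def chunk_paths(frame_paths, batch_size=10):
        # Hypothetical helper: yield successive fixed-size batches of frame paths.
        for i in range(0, len(frame_paths), batch_size):
            yield frame_paths[i:i + batch_size]

    # e.g. 95 extracted frames with batch_size=10 -> nine batches of 10, one of 5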
@@ -101,13 +102,13 @@ def sam_dino_vid(

         annotated_frame_paths = [os.path.join(frames_dir, os.path.basename(frame_path)) for frame_path in batch_paths]
         # convert images_orig to rgb from bgr
-
+        images_orig_rgb = [cv2.cvtColor(image, cv2.COLOR_BGR2RGB) for image in images_orig]

         if masks_needed:
             # run SAM in batches on boxes from dino
             batched_input = []
             sam_boxes = []
-            for image, box in zip(
+            for image, box in zip(images_orig_rgb, boxes_i):
                 height, width = image.shape[:2]
                 # convert the boxes from groundingDINO format to SAM format
                 box = box * torch.Tensor([width, height, width, height])
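GroundingDINO emits boxes as normalized (cx, cy, w, h) while SAM expects absolute-pixel (x1, y1, x2, y2), so the scaling by [width, height, width, height] above is the first half of the conversion. The second half presumably goes through the box_convert imported at the top of the file; a self-contained sketch:

    import torch
    from torchvision.ops import box_convert

    def dino_boxes_to_sam(boxes, height, width):
        # Normalized cxcywh -> pixel cxcywh -> pixel xyxy.
        boxes = boxes * torch.tensor([width, height, width, height])
        return box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy")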
@@ -123,7 +124,7 @@ def sam_dino_vid(
                 # write to annotated_frames_dir for stitching
                 mask = prediction["masks"].cpu().numpy()
                 box = sam_boxes[i].cpu().numpy()
-                annotated_frame = plot_sam(
+                annotated_frame = plot_sam(images_orig_rgb[i], mask, box, boxes_shown=boxes_needed)
                 cv2.imwrite(annotated_frame_paths[i], annotated_frame)

         elif boxes_needed and not masks_needed:
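The batched_input / prediction["masks"] pair matches segment_anything's batched forward pass: a list of per-image dicts in, a list of per-image dicts out. A sketch of one entry, assuming sam is the loaded model and transform is its ResizeLongestSide (both are constructed outside the hunks shown here):

    batched_input.append({
        'image': prepare_image(image, transform, sam),               # 3xHxW tensor on the model's device
        'boxes': transform.apply_boxes_torch(box, image.shape[:2]),  # xyxy boxes, resized to model input
        'original_size': image.shape[:2],
    })
    predictions = sam(batched_input, multimask_output=False)
    mask = predictions[i]['masks']  # one output dict per input image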
@@ -215,6 +216,8 @@ def plot_sam(
     return image

 # if __name__ == '__main__':
+#     def run_sam_dino_vid():
+#         sam_dino_vid("baboon_15s.mp4", "baboon", box_threshold=0.3, text_threshold=0.3, fps_processed=30, video_options=['Bounding boxes', 'Masks'])
 #     start_time = datetime.datetime.now()
-#
-#     print("elapsed: " + str(datetime.datetime.now() - start_time))
+#     stats = run_sam_dino_vid()
+#     print("elapsed: " + str(datetime.datetime.now() - start_time))
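When uncommented, the new block wraps the test invocation in a function so it can carry the @profile decorator, then times a full run. A runnable version of the same harness, using the exact values from the comments:

    import datetime

    # @profile  # uncomment together with the import above for the memory report
    def run_sam_dino_vid():
        return sam_dino_vid("baboon_15s.mp4", "baboon",
                            box_threshold=0.3, text_threshold=0.3,
                            fps_processed=30,
                            video_options=['Bounding boxes', 'Masks'])

    if __name__ == '__main__':
        start_time = datetime.datetime.now()
        run_sam_dino_vid()
        print("elapsed: " + str(datetime.datetime.now() - start_time))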