Spaces:

sahirp
/

text_2_segment_any_vid

Running on Zero

App Files Files Community

er1t0 commited on Aug 3, 2024

Commit

9b87d5a

1 Parent(s): 7976ee8

shift to ffmpeg

Browse files

Files changed (4) hide show

app.py +108 -98
myapp2.py +204 -0
packages.txt +1 -0
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import torch
 import numpy as np
@@ -10,6 +9,7 @@ from sam2.sam2_image_predictor import SAM2ImagePredictor
 import cv2
 import traceback
 import matplotlib.pyplot as plt
 from utils import load_model_without_flash_attn
@@ -62,7 +62,7 @@ def apply_color_mask(frame, mask, obj_id):
     return frame * (1 - mask) + colored_mask * 255
 def run_florence(image, text_input):
-    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
         task_prompt = '<OPEN_VOCABULARY_DETECTION>'
         prompt = task_prompt + text_input
         inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to('cuda', torch.bfloat16)
@@ -89,125 +89,135 @@ def remove_directory_contents(directory):
         for name in dirs:
             os.rmdir(os.path.join(root, name))
-def process_video(video_path, prompt, chunk_size=30):
     try:
-        video = cv2.VideoCapture(video_path)
-        if not video.isOpened():
-            raise ValueError("Unable to open video file")
-        fps = video.get(cv2.CAP_PROP_FPS)
-        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-        # Process video in chunks
-        all_segmented_frames = []
-        for chunk_start in range(0, frame_count, chunk_size):
-            chunk_end = min(chunk_start + chunk_size, frame_count)
-            frames = []
-            video.set(cv2.CAP_PROP_POS_FRAMES, chunk_start)
-            for _ in range(chunk_end - chunk_start):
-                ret, frame = video.read()
-                if not ret:
-                    break
-                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-            if not frames:
-                print(f"No frames extracted for chunk starting at {chunk_start}")
-                continue
-            # Florence detection on first frame of the chunk
-            first_frame = Image.fromarray(frames[0])
-            mask_box = run_florence(first_frame, prompt)
-            print("Original mask box:", mask_box)
-            # Convert mask_box to numpy array and ensure it's in the correct format
-            mask_box = np.array(mask_box)
-            print("Reshaped mask box:", mask_box)
-            # SAM2 segmentation on first frame
-            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-                image_predictor.set_image(first_frame)
-                masks, _, _ = image_predictor.predict(
-                    point_coords=None,
-                    point_labels=None,
-                    box=mask_box[None, :],
-                    multimask_output=False,
-                )
-            print("masks.shape",masks.shape)
-            mask = masks.squeeze().astype(bool)
-            print("Mask shape:", mask.shape)
-            print("Frame shape:", frames[0].shape)
-            # SAM2 video propagation
-            temp_dir = f"temp_frames_{chunk_start}"
-            os.makedirs(temp_dir, exist_ok=True)
-            for i, frame in enumerate(frames):
-                cv2.imwrite(os.path.join(temp_dir, f"{i:04d}.jpg"), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-                inference_state = video_predictor.init_state(video_path=temp_dir)
-                _, _, _ = video_predictor.add_new_mask(
-                    inference_state=inference_state,
-                    frame_idx=0,
-                    obj_id=1,
-                    mask=mask
-                )
-                video_segments = {}
-                for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state):
-                    video_segments[out_frame_idx] = {
-                        out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
-                        for i, out_obj_id in enumerate(out_obj_ids)
-                    }
-            print('segmenting for main vid done')
-            # Apply segmentation masks to frames
-            for i, frame in enumerate(frames):
-                if i in video_segments:
-                    for out_obj_id, mask in video_segments[i].items():
-                        frame = apply_color_mask(frame, mask, out_obj_id)
-                    all_segmented_frames.append(frame.astype(np.uint8))
-                else:
-                    all_segmented_frames.append(frame)
-            # Clean up temporary files
-            remove_directory_contents(temp_dir)
-            os.rmdir(temp_dir)
-        video.release()
-        if not all_segmented_frames:
-            raise ValueError("No frames were processed successfully")
-        # Create video from segmented frames
         output_path = "segmented_video.mp4"
-        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps,
-                              (all_segmented_frames[0].shape[1], all_segmented_frames[0].shape[0]))
         for frame in all_segmented_frames:
-            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-        out.release()
         return output_path
     except Exception as e:
         print(f"Error in process_video: {str(e)}")
         print(traceback.format_exc())  # This will print the full stack trace
         return None
-def segment_video(video_file, prompt, chunk_size):
     if video_file is None:
         return None
-    output_video = process_video(video_file, prompt, int(chunk_size))
     return output_video
 demo = gr.Interface(
     fn=segment_video,
     inputs=[
         gr.Video(label="Upload Video"),
-        gr.Textbox(label="Enter prompt (e.g., 'a gymnast')"),
-        gr.Slider(minimum=10, maximum=100, step=10, value=30, label="Chunk Size (frames)")
     ],
     outputs=gr.Video(label="Segmented Video"),
     title="Video Object Segmentation with Florence and SAM2",

 import os
 import torch
 import numpy as np
 import cv2
 import traceback
 import matplotlib.pyplot as plt
+import ffmpeg
 from utils import load_model_without_flash_attn
     return frame * (1 - mask) + colored_mask * 255
 def run_florence(image, text_input):
+    with torch.amp.autocast(dtype=torch.bfloat16):
         task_prompt = '<OPEN_VOCABULARY_DETECTION>'
         prompt = task_prompt + text_input
         inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to('cuda', torch.bfloat16)
         for name in dirs:
             os.rmdir(os.path.join(root, name))
+def process_video(video_path, prompt):
     try:
+        # Get video info
+        probe = ffmpeg.probe(video_path)
+        video_info = next(s for s in probe['streams'] if s['codec_type'] == 'video')
+        width = int(video_info['width'])
+        height = int(video_info['height'])
+        num_frames = int(video_info['nb_frames'])
+        fps = eval(video_info['r_frame_rate'])
+        print(f"Video info: {width}x{height}, {num_frames} frames, {fps} fps")
+        # Read frames
+        out, _ = (
+            ffmpeg
+            .input(video_path)
+            .output('pipe:', format='rawvideo', pix_fmt='rgb24')
+            .run(capture_stdout=True)
+        )
+        frames = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
+        print(f"Read {len(frames)} frames")
+        # Florence detection on first frame
+        first_frame = Image.fromarray(frames[0])
+        mask_box = run_florence(first_frame, prompt)
+        print("Original mask box:", mask_box)
+        # Convert mask_box to numpy array
+        mask_box = np.array(mask_box)
+        print("Reshaped mask box:", mask_box)
+        # SAM2 segmentation on first frame
+        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+            image_predictor.set_image(first_frame)
+            masks, _, _ = image_predictor.predict(
+                point_coords=None,
+                point_labels=None,
+                box=mask_box[None, :],
+                multimask_output=False,
+            )
+        print("masks.shape", masks.shape)
+        mask = masks.squeeze().astype(bool)
+        print("Mask shape:", mask.shape)
+        print("Frame shape:", frames[0].shape)
+        # SAM2 video propagation
+        temp_dir = "temp_frames"
+        os.makedirs(temp_dir, exist_ok=True)
+        for i, frame in enumerate(frames):
+            Image.fromarray(frame).save(os.path.join(temp_dir, f"{i:04d}.jpg"))
+        print(f"Saved {len(frames)} temporary frames")
+        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+            inference_state = video_predictor.init_state(video_path=temp_dir)
+            _, _, _ = video_predictor.add_new_mask(
+                inference_state=inference_state,
+                frame_idx=0,
+                obj_id=1,
+                mask=mask
+            )
+            video_segments = {}
+            for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state):
+                video_segments[out_frame_idx] = {
+                    out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                    for i, out_obj_id in enumerate(out_obj_ids)
+                }
+        print('Segmenting for main vid done')
+        print(f"Number of segmented frames: {len(video_segments)}")
+        # Apply segmentation masks to frames
+        all_segmented_frames = []
+        for i, frame in enumerate(frames):
+            if i in video_segments:
+                for out_obj_id, mask in video_segments[i].items():
+                    frame = apply_color_mask(frame, mask, out_obj_id)
+                all_segmented_frames.append(frame.astype(np.uint8))
+            else:
+                all_segmented_frames.append(frame)
+        print(f"Applied masks to {len(all_segmented_frames)} frames")
+        # Clean up temporary files
+        remove_directory_contents(temp_dir)
+        os.rmdir(temp_dir)
+        # Write output video using ffmpeg
         output_path = "segmented_video.mp4"
+        process = (
+            ffmpeg
+            .input('pipe:', format='rawvideo', pix_fmt='rgb24', s=f'{width}x{height}', r=fps)
+            .output(output_path, pix_fmt='yuv420p')
+            .overwrite_output()
+            .run_async(pipe_stdin=True)
+        )
         for frame in all_segmented_frames:
+            process.stdin.write(frame.tobytes())
+        process.stdin.close()
+        process.wait()
+        if not os.path.exists(output_path):
+            raise ValueError(f"Output video file was not created: {output_path}")
+        print(f"Successfully created output video: {output_path}")
         return output_path
     except Exception as e:
         print(f"Error in process_video: {str(e)}")
         print(traceback.format_exc())  # This will print the full stack trace
         return None
+def segment_video(video_file, prompt):
     if video_file is None:
         return None
+    output_video = process_video(video_file, prompt)
     return output_video
 demo = gr.Interface(
     fn=segment_video,
     inputs=[
         gr.Video(label="Upload Video"),
+        gr.Textbox(label="Enter prompt (e.g., 'a gymnast')")
     ],
     outputs=gr.Video(label="Segmented Video"),
     title="Video Object Segmentation with Florence and SAM2",

myapp2.py ADDED Viewed

	@@ -0,0 +1,204 @@

+import os
+import torch
+import numpy as np
+import gradio as gr
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForCausalLM
+from sam2.build_sam import build_sam2_video_predictor, build_sam2
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+import cv2
+import traceback
+import matplotlib.pyplot as plt
+# CUDA optimizations
+torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+if torch.cuda.get_device_properties(0).major >= 8:
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+# Initialize models
+sam2_checkpoint = "../checkpoints/sam2_hiera_large.pt"
+model_cfg = "sam2_hiera_l.yaml"
+video_predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint)
+sam2_model = build_sam2(model_cfg, sam2_checkpoint, device="cuda")
+image_predictor = SAM2ImagePredictor(sam2_model)
+model_id = 'microsoft/Florence-2-large'
+florence_model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.bfloat16).eval().cuda()
+florence_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+def apply_color_mask(frame, mask, obj_id):
+    cmap = plt.get_cmap("tab10")
+    color = np.array(cmap(obj_id % 10)[:3])  # Use modulo 10 to cycle through colors
+    # Ensure mask has the correct shape
+    if mask.ndim == 4:
+        mask = mask.squeeze()  # Remove singleton dimensions
+    if mask.ndim == 3 and mask.shape[0] == 1:
+        mask = mask[0]  # Take the first channel if it's a single-channel 3D array
+    # Reshape mask to match frame dimensions
+    mask = cv2.resize(mask.astype(np.float32), (frame.shape[1], frame.shape[0]), interpolation=cv2.INTER_LINEAR)
+    # Expand dimensions of mask and color for broadcasting
+    mask = np.expand_dims(mask, axis=2)
+    color = color.reshape(1, 1, 3)
+    colored_mask = mask * color
+    return frame * (1 - mask) + colored_mask * 255
+def run_florence(image, text_input):
+    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+        task_prompt = '<OPEN_VOCABULARY_DETECTION>'
+        prompt = task_prompt + text_input
+        inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to('cuda', torch.bfloat16)
+        generated_ids = florence_model.generate(
+            input_ids=inputs["input_ids"].cuda(),
+            pixel_values=inputs["pixel_values"].cuda(),
+            max_new_tokens=1024,
+            early_stopping=False,
+            do_sample=False,
+            num_beams=3,
+        )
+        generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+        parsed_answer = florence_processor.post_process_generation(
+            generated_text,
+            task=task_prompt,
+            image_size=(image.width, image.height)
+        )
+    return parsed_answer[task_prompt]['bboxes'][0]
+def remove_directory_contents(directory):
+    for root, dirs, files in os.walk(directory, topdown=False):
+        for name in files:
+            os.remove(os.path.join(root, name))
+        for name in dirs:
+            os.rmdir(os.path.join(root, name))
+def process_video(video_path, prompt, chunk_size=30):
+    try:
+        video = cv2.VideoCapture(video_path)
+        if not video.isOpened():
+            raise ValueError("Unable to open video file")
+        fps = video.get(cv2.CAP_PROP_FPS)
+        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+        # Process video in chunks
+        all_segmented_frames = []
+        for chunk_start in range(0, frame_count, chunk_size):
+            chunk_end = min(chunk_start + chunk_size, frame_count)
+            frames = []
+            video.set(cv2.CAP_PROP_POS_FRAMES, chunk_start)
+            for _ in range(chunk_end - chunk_start):
+                ret, frame = video.read()
+                if not ret:
+                    break
+                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            if not frames:
+                print(f"No frames extracted for chunk starting at {chunk_start}")
+                continue
+            # Florence detection on first frame of the chunk
+            first_frame = Image.fromarray(frames[0])
+            mask_box = run_florence(first_frame, prompt)
+            print("Original mask box:", mask_box)
+            # Convert mask_box to numpy array and ensure it's in the correct format
+            mask_box = np.array(mask_box)
+            print("Reshaped mask box:", mask_box)
+            # SAM2 segmentation on first frame
+            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                image_predictor.set_image(first_frame)
+                masks, _, _ = image_predictor.predict(
+                    point_coords=None,
+                    point_labels=None,
+                    box=mask_box[None, :],
+                    multimask_output=False,
+                )
+            print("masks.shape",masks.shape)
+            mask = masks.squeeze().astype(bool)
+            print("Mask shape:", mask.shape)
+            print("Frame shape:", frames[0].shape)
+            # SAM2 video propagation
+            temp_dir = f"temp_frames_{chunk_start}"
+            os.makedirs(temp_dir, exist_ok=True)
+            for i, frame in enumerate(frames):
+                cv2.imwrite(os.path.join(temp_dir, f"{i:04d}.jpg"), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                inference_state = video_predictor.init_state(video_path=temp_dir)
+                _, _, _ = video_predictor.add_new_mask(
+                    inference_state=inference_state,
+                    frame_idx=0,
+                    obj_id=1,
+                    mask=mask
+                )
+                video_segments = {}
+                for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state):
+                    video_segments[out_frame_idx] = {
+                        out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                        for i, out_obj_id in enumerate(out_obj_ids)
+                    }
+            print('segmenting for main vid done')
+            # Apply segmentation masks to frames
+            for i, frame in enumerate(frames):
+                if i in video_segments:
+                    for out_obj_id, mask in video_segments[i].items():
+                        frame = apply_color_mask(frame, mask, out_obj_id)
+                    all_segmented_frames.append(frame.astype(np.uint8))
+                else:
+                    all_segmented_frames.append(frame)
+            # Clean up temporary files
+            remove_directory_contents(temp_dir)
+            os.rmdir(temp_dir)
+        video.release()
+        if not all_segmented_frames:
+            raise ValueError("No frames were processed successfully")
+        # Create video from segmented frames
+        output_path = "segmented_video.mp4"
+        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps,
+                              (all_segmented_frames[0].shape[1], all_segmented_frames[0].shape[0]))
+        for frame in all_segmented_frames:
+            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+        out.release()
+        return output_path
+    except Exception as e:
+        print(f"Error in process_video: {str(e)}")
+        print(traceback.format_exc())  # This will print the full stack trace
+        return None
+def segment_video(video_file, prompt, chunk_size):
+    if video_file is None:
+        return None
+    output_video = process_video(video_file, prompt, int(chunk_size))
+    return output_video
+demo = gr.Interface(
+    fn=segment_video,
+    inputs=[
+        gr.Video(label="Upload Video"),
+        gr.Textbox(label="Enter prompt (e.g., 'a gymnast')"),
+        gr.Slider(minimum=10, maximum=100, step=10, value=30, label="Chunk Size (frames)")
+    ],
+    outputs=gr.Video(label="Segmented Video"),
+    title="Video Object Segmentation with Florence and SAM2",
+    description="Upload a video and provide a text prompt to segment a specific object throughout the video."
+)
+demo.launch()

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ opencv-python
 matplotlib
 einops
 timm
-pytest

 matplotlib
 einops
 timm
+pytest
+ffmpeg-python