EduardoPacheco committed
Commit ce78b5d
1 Parent(s): 0293c20

First commit

Files changed (5)
  1. .gitattributes +1 -0
  2. .gitignore +1 -0
  3. app.py +65 -0
  4. assets/dog-running.mp4 +3 -0
  5. utils.py +141 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/dog-running.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__
app.py ADDED
@@ -0,0 +1,65 @@
+import torch
+import gradio as gr
+from transformers import AutoModel
+
+import utils
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = AutoModel.from_pretrained("facebook/dinov2-base")
+model.to(device)
+
+def app_fn(
+    source_video: str,
+    batch_size: int,
+    threshold: float,
+    n_patches: int,
+    is_larger: bool,
+    interpolate: bool,
+) -> str:
+    frames = utils.load_video_frames(source_video)
+    processed_frames = utils.process_video(
+        model=model,
+        video=frames,
+        batch_size=batch_size,
+        threshold=threshold,
+        n_patches=n_patches,
+        is_larger=is_larger,
+        interpolate=interpolate,
+        device=device,
+    )
+
+    output_video = utils.create_video_from_frames_rgb(processed_frames)
+
+    return output_video
+
+if __name__ == "__main__":
+    title = "🦖 DINOv2 Video 🦖"
+    with gr.Blocks(title=title) as demo:
+        with gr.Row():
+            source_video = gr.Video(label="Input Video", sources="upload", format="mp4")
+            output_video = gr.Video(label="Output Video")
+        with gr.Row():
+            batch_size = gr.Slider(minimum=1, maximum=30, step=1, value=4, label="Batch Size")
+            threshold = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Threshold")
+            n_patches = gr.Slider(minimum=20, maximum=40, step=1, value=30, label="Number of Patches")
+            is_larger = gr.Checkbox(label="Is Larger", value=True)
+            interpolate = gr.Checkbox(label="Interpolate", value=False)
+
+        btn = gr.Button("Process Video")
+        btn.click(
+            fn=app_fn,
+            inputs=[source_video, batch_size, threshold, n_patches, is_larger, interpolate],
+            outputs=[output_video]
+        )
+        examples = gr.Examples(
+            examples=[
+                ["assets/dog-running.mp4", 30, 0.5, 40, True, False],
+            ],
+            inputs=[source_video, batch_size, threshold, n_patches, is_larger, interpolate],
+            outputs=[output_video],
+            fn=app_fn,
+            cache_examples=True
+        )
+
+    demo.queue(max_size=5).launch()
+
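For a quick smoke test outside the Gradio UI, app_fn can be called directly with the bundled example clip. A minimal sketch, assuming it is run from the repo root with the Space's dependencies installed; importing app also downloads the DINOv2 checkpoint, and the parameter values below mirror the slider defaults:

import app  # loads facebook/dinov2-base at import time

output_path = app.app_fn(
    source_video="assets/dog-running.mp4",  # bundled example clip
    batch_size=4,
    threshold=0.5,
    n_patches=30,
    is_larger=True,
    interpolate=False,
)
print(output_path)  # e.g. "animation.mp4", written to the working directory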
assets/dog-running.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b36eaefc8b224d27f262f37d092f1438a2bbe6a997bb0077889a7cb5ab9911eb
+size 3446481
utils.py ADDED
@@ -0,0 +1,141 @@
+from typing import List
+
+import cv2
+import torch
+import numpy as np
+from tqdm import tqdm
+import supervision as sv
+import torch.nn.functional as F
+from transformers import AutoModel
+from sklearn.decomposition import PCA
+from torchvision import transforms as T
+from sklearn.preprocessing import MinMaxScaler
+
+
+def load_video_frames(video_path: str) -> List[np.ndarray]:
+    frames = []
+    for frame in tqdm(sv.get_video_frames_generator(source_path=video_path), unit=" frames"):
+        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+    return frames
+
+def preprocess(image: np.ndarray, n_patches: int, device: str, patch_size: int = 14) -> torch.Tensor:
+    IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+    IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+
+    transform = T.Compose([
+        T.Resize((n_patches * patch_size, n_patches * patch_size)),
+        T.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    ])
+
+    img = torch.from_numpy(image).type(torch.float).permute(2, 0, 1) / 255
+    img_tensor = transform(img).unsqueeze(0).to(device)
+
+    return img_tensor
+
+
+def process_video(
+    model: AutoModel,
+    video: str | List[np.ndarray],
+    is_larger: bool = True,
+    batch_size: int = 4,
+    threshold: float = 0.5,
+    n_patches: int = 40,
+    interpolate: bool = False,
+    device: str = "cpu"
+) -> List[np.ndarray]:
+    # NP = N_PATCHES
+    # P = PATCH_SIZE
+    if isinstance(video, str):
+        frames = load_video_frames(video)
+    else:
+        frames = video
+    patch_size = model.config.patch_size
+
+    original_height = frames[0].shape[0]  # frames are H, W, C
+    original_width = frames[0].shape[1]
+
+    final_frames = []
+    pca = PCA(n_components=3)
+    scaler = MinMaxScaler(clip=True)
+
+    for i in range(len(frames) // batch_size):  # trailing frames that do not fill a batch are dropped
+        batch = frames[i * batch_size:batch_size * (i + 1)]
+        pixel_values = [
+            preprocess(f, n_patches, device, patch_size).squeeze(0) for f in batch
+        ]
+        pixel_values = torch.stack(pixel_values)  # B, C, NP * P, NP * P
+
+        with torch.no_grad():
+            out = model(pixel_values=pixel_values)
+
+        features = out.last_hidden_state[:, 1:]  # B, NP * NP, HIDDEN_DIM (CLS token dropped)
+        features = features.cpu().numpy()
+        features = features.reshape(batch_size * n_patches * n_patches, -1)  # B * NP * NP, HIDDEN_DIM
+
+        pca_features = pca.fit_transform(features)
+        pca_features = scaler.fit_transform(pca_features)
+
+        # The leading PCA component separates background from foreground.
+        if is_larger:
+            pca_features_bg = pca_features[:, 0] > threshold
+        else:
+            pca_features_bg = pca_features[:, 0] < threshold
+
+        pca_features_fg = ~pca_features_bg
+
+        # Second PCA pass on the foreground patches only, rendered as RGB.
+        pca_features_fg_seg = pca.fit_transform(features[pca_features_fg])
+        pca_features_fg_seg = scaler.fit_transform(pca_features_fg_seg)
+
+        pca_features_rgb = np.zeros((batch_size * n_patches * n_patches, 3))
+        pca_features_rgb[pca_features_bg] = 0
+        pca_features_rgb[pca_features_fg] = pca_features_fg_seg
+        pca_features_rgb = pca_features_rgb.reshape(batch_size, n_patches, n_patches, 3)
+
+        if interpolate:
+            # transformed into a torch tensor, B, NP, NP, 3
+            pca_features_rgb = torch.from_numpy(pca_features_rgb)
+            # reshaped to B, 3, NP, NP for F.interpolate
+            pca_features_rgb = pca_features_rgb.permute(0, 3, 1, 2)
+            # interpolated to B, 3, H, W,
+            # reshaped to B, H, W, 3,
+            # and unbound into a list of len B with tensors of shape H, W, 3
+            pca_features_rgb = F.interpolate(
+                pca_features_rgb,
+                size=(original_height, original_width),
+                mode='bilinear',
+                align_corners=False
+            ).permute(0, 2, 3, 1).unbind(0)
+            pca_features_rgb = [f.numpy() for f in pca_features_rgb]
+        else:
+            pca_features_rgb = [f for f in pca_features_rgb]
+        # Fix range to np.uint8 and add to the final_frames list
+        final_frames.extend([(f * 255).astype(np.uint8) for f in pca_features_rgb])
+
+    return final_frames
+
+
+def create_video_from_frames_rgb(
+    frame_list: List[np.ndarray],
+    output_filename: str = "animation.mp4",
+    fps: int = 15
+) -> str:
+    # Get the shape of the frames to determine video dimensions
+    frame_height, frame_width, _ = frame_list[0].shape
+
+    # Define the codec and create a VideoWriter object
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # change the codec as needed
+    out = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))
+
+    for frame in frame_list:
+        # Convert the frame from RGB to BGR (OpenCV expects BGR)
+        bgr_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+
+        # Write the frame to the video file
+        out.write(bgr_frame)
+
+    # Release the VideoWriter object
+    out.release()
+
+    return output_filename
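The heart of process_video is the two-pass PCA visualization popularized by the DINOv2 paper: the leading component of a first PCA over all patch features is thresholded into a foreground/background mask, and a second PCA fit only on the foreground patches is rendered as RGB. A minimal single-image sketch of the same idea; the input file image.jpg and the output name pca_mask.png are placeholders, and the comparison direction matches is_larger=True:

import cv2
import numpy as np
import torch
from transformers import AutoModel
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import utils

model = AutoModel.from_pretrained("facebook/dinov2-base")
n_patches = 30  # matches the app's default slider value

# Same preprocessing as the video path, for a single RGB image.
image = cv2.cvtColor(cv2.imread("image.jpg"), cv2.COLOR_BGR2RGB)  # placeholder input
pixel_values = utils.preprocess(image, n_patches, "cpu", model.config.patch_size)

with torch.no_grad():
    features = model(pixel_values=pixel_values).last_hidden_state[:, 1:]  # 1, NP * NP, D
features = features.squeeze(0).numpy()

pca, scaler = PCA(n_components=3), MinMaxScaler(clip=True)

# Pass 1: threshold the leading component into a foreground mask.
first_pass = scaler.fit_transform(pca.fit_transform(features))
fg = first_pass[:, 0] <= 0.5  # flip the comparison if the mask looks inverted

# Pass 2: PCA over the foreground patches only, rendered as RGB.
rgb = np.zeros((n_patches * n_patches, 3))
rgb[fg] = scaler.fit_transform(pca.fit_transform(features[fg]))
mask = (rgb.reshape(n_patches, n_patches, 3) * 255).astype(np.uint8)
cv2.imwrite("pca_mask.png", cv2.cvtColor(mask, cv2.COLOR_RGB2BGR))

On DINOv2 features the leading component usually splits the salient object from its background, which is why the single Threshold slider in the app is enough to control the mask.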