EduardoPacheco committed
Commit ce78b5d
1 Parent(s): 0293c20

First commit

Files changed (5)
  1. .gitattributes +1 -0
  2. .gitignore +1 -0
  3. app.py +65 -0
  4. assets/dog-running.mp4 +3 -0
  5. utils.py +141 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/dog-running.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__
app.py ADDED
@@ -0,0 +1,65 @@
+import torch
+import gradio as gr
+from transformers import AutoModel
+
+import utils
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = AutoModel.from_pretrained("facebook/dinov2-base")
+model.to(device)
+
+def app_fn(
+    source_video: str,
+    batch_size: int,
+    threshold: float,
+    n_patches: int,
+    is_larger: bool,
+    interpolate: bool,
+) -> str:
+    frames = utils.load_video_frames(source_video)
+    processed_frames = utils.process_video(
+        model=model,
+        video=frames,
+        batch_size=batch_size,
+        threshold=threshold,
+        n_patches=n_patches,
+        is_larger=is_larger,
+        interpolate=interpolate,
+        device=device,
+    )
+
+    output_video = utils.create_video_from_frames_rgb(processed_frames)
+
+    return output_video
+
+if __name__ == "__main__":
+    title = "🦖 DINOv2 Video 🦖"
+    with gr.Blocks(title=title) as demo:
+        with gr.Row():
+            source_video = gr.Video(label="Input Video", sources="upload", format="mp4")
+            output_video = gr.Video(label="Output Video")
+        with gr.Row():
+            batch_size = gr.Slider(minimum=1, maximum=30, step=1, value=4, label="Batch Size")
+            threshold = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Threshold")
+            n_patches = gr.Slider(minimum=20, maximum=40, step=1, value=30, label="Number of Patches")
+            is_larger = gr.Checkbox(label="Is Larger", value=True)
+            interpolate = gr.Checkbox(label="Interpolate", value=False)
+
+        btn = gr.Button("Process Video")
+        btn.click(
+            fn=app_fn,
+            inputs=[source_video, batch_size, threshold, n_patches, is_larger, interpolate],
+            outputs=[output_video]
+        )
+        examples = gr.Examples(
+            examples=[
+                ["assets/dog-running.mp4", 30, 0.5, 40, True, False],
+            ],
+            inputs=[source_video, batch_size, threshold, n_patches, is_larger, interpolate],
+            outputs=[output_video],
+            fn=app_fn,
+            cache_examples=True
+        )
+
+    demo.queue(max_size=5).launch()
+
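For a quick smoke test outside the Gradio UI, app_fn can be called directly with the bundled example clip. A minimal sketch, assuming it is run from the repo root with the Space's dependencies installed; importing app also downloads the DINOv2 checkpoint, and the parameter values below mirror the slider defaults:

import app  # loads facebook/dinov2-base at import time

output_path = app.app_fn(
    source_video="assets/dog-running.mp4",  # bundled example clip
    batch_size=4,
    threshold=0.5,
    n_patches=30,
    is_larger=True,
    interpolate=False,
)
print(output_path)  # e.g. "animation.mp4", written to the working directory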
assets/dog-running.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b36eaefc8b224d27f262f37d092f1438a2bbe6a997bb0077889a7cb5ab9911eb
+size 3446481
utils.py ADDED
@@ -0,0 +1,141 @@
+from typing import List
+
+import cv2
+import torch
+import numpy as np
+from tqdm import tqdm
+import supervision as sv
+import torch.nn.functional as F
+from transformers import AutoModel
+from sklearn.decomposition import PCA
+from torchvision import transforms as T
+from sklearn.preprocessing import MinMaxScaler
+
+
+def load_video_frames(video_path: str) -> List[np.ndarray]:
+    frames = []
+    for frame in tqdm(sv.get_video_frames_generator(source_path=video_path), unit=" frames"):
+        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+    return frames
+
+def preprocess(image: np.ndarray, n_patches: int, device: str, patch_size: int = 14) -> torch.Tensor:
+    IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+    IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+
+    transform = T.Compose([
+        T.Resize((n_patches * patch_size, n_patches * patch_size)),
+        T.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+    ])
+
+    img = torch.from_numpy(image).type(torch.float).permute(2, 0, 1) / 255
+    img_tensor = transform(img).unsqueeze(0).to(device)
+
+    return img_tensor
+
+
+def process_video(
+    model: AutoModel,
+    video: str | List[np.ndarray],
+    is_larger: bool = True,
+    batch_size: int = 4,
+    threshold: float = 0.5,
+    n_patches: int = 40,
+    interpolate: bool = False,
+    device: str = "cpu"
+) -> List[np.ndarray]:
+    # NP = N_PATCHES
+    # P = PATCH_SIZE
+    if isinstance(video, str):
+        frames = load_video_frames(video)
+    else:
+        frames = video
+    patch_size = model.config.patch_size
+
+    original_height = frames[0].shape[0]  # frames are H, W, C
+    original_width = frames[0].shape[1]
+
+    final_frames = []
+    pca = PCA(n_components=3)
+    scaler = MinMaxScaler(clip=True)
+
+    for i in range(len(frames) // batch_size):  # trailing frames that do not fill a batch are dropped
+        batch = frames[i * batch_size:batch_size * (i + 1)]
+        pixel_values = [
+            preprocess(f, n_patches, device, patch_size).squeeze(0) for f in batch
+        ]
+        pixel_values = torch.stack(pixel_values)  # B, C, NP * P, NP * P
+
+        with torch.no_grad():
+            out = model(pixel_values=pixel_values)
+
+        features = out.last_hidden_state[:, 1:]  # B, NP * NP, HIDDEN_DIM (CLS token dropped)
+        features = features.cpu().numpy()
+        features = features.reshape(batch_size * n_patches * n_patches, -1)  # B * NP * NP, HIDDEN_DIM
+
+        pca_features = pca.fit_transform(features)
+        pca_features = scaler.fit_transform(pca_features)
+
+        # The leading PCA component separates background from foreground.
+        if is_larger:
+            pca_features_bg = pca_features[:, 0] > threshold
+        else:
+            pca_features_bg = pca_features[:, 0] < threshold
+
+        pca_features_fg = ~pca_features_bg
+
+        # Second PCA pass on the foreground patches only, rendered as RGB.
+        pca_features_fg_seg = pca.fit_transform(features[pca_features_fg])
+        pca_features_fg_seg = scaler.fit_transform(pca_features_fg_seg)
+
+        pca_features_rgb = np.zeros((batch_size * n_patches * n_patches, 3))
+        pca_features_rgb[pca_features_bg] = 0
+        pca_features_rgb[pca_features_fg] = pca_features_fg_seg
+        pca_features_rgb = pca_features_rgb.reshape(batch_size, n_patches, n_patches, 3)
+
+        if interpolate:
+            # transformed into a torch tensor, B, NP, NP, 3
+            pca_features_rgb = torch.from_numpy(pca_features_rgb)
+            # reshaped to B, 3, NP, NP for F.interpolate
+            pca_features_rgb = pca_features_rgb.permute(0, 3, 1, 2)
+            # interpolated to B, 3, H, W,
+            # reshaped to B, H, W, 3,
+            # and unbound into a list of len B with tensors of shape H, W, 3
+            pca_features_rgb = F.interpolate(
+                pca_features_rgb,
+                size=(original_height, original_width),
+                mode='bilinear',
+                align_corners=False
+            ).permute(0, 2, 3, 1).unbind(0)
+            pca_features_rgb = [f.numpy() for f in pca_features_rgb]
+        else:
+            pca_features_rgb = [f for f in pca_features_rgb]
+        # Fix range to np.uint8 and add to the final_frames list
+        final_frames.extend([(f * 255).astype(np.uint8) for f in pca_features_rgb])
+
+    return final_frames
+
+
+def create_video_from_frames_rgb(
+    frame_list: List[np.ndarray],
+    output_filename: str = "animation.mp4",
+    fps: int = 15
+) -> str:
+    # Get the shape of the frames to determine video dimensions
+    frame_height, frame_width, _ = frame_list[0].shape
+
+    # Define the codec and create a VideoWriter object
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # change the codec as needed
+    out = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))
+
+    for frame in frame_list:
+        # Convert the frame from RGB to BGR (OpenCV expects BGR)
+        bgr_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+
+        # Write the frame to the video file
+        out.write(bgr_frame)
+
+    # Release the VideoWriter object
+    out.release()
+
+    return output_filename
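The heart of process_video is the two-pass PCA visualization popularized by the DINOv2 paper: the leading component of a first PCA over all patch features is thresholded into a foreground/background mask, and a second PCA fit only on the foreground patches is rendered as RGB. A minimal single-image sketch of the same idea; the input file image.jpg and the output name pca_mask.png are placeholders, and the comparison direction matches is_larger=True:

import cv2
import numpy as np
import torch
from transformers import AutoModel
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import utils

model = AutoModel.from_pretrained("facebook/dinov2-base")
n_patches = 30  # matches the app's default slider value

# Same preprocessing as the video path, for a single RGB image.
image = cv2.cvtColor(cv2.imread("image.jpg"), cv2.COLOR_BGR2RGB)  # placeholder input
pixel_values = utils.preprocess(image, n_patches, "cpu", model.config.patch_size)

with torch.no_grad():
    features = model(pixel_values=pixel_values).last_hidden_state[:, 1:]  # 1, NP * NP, D
features = features.squeeze(0).numpy()

pca, scaler = PCA(n_components=3), MinMaxScaler(clip=True)

# Pass 1: threshold the leading component into a foreground mask.
first_pass = scaler.fit_transform(pca.fit_transform(features))
fg = first_pass[:, 0] <= 0.5  # flip the comparison if the mask looks inverted

# Pass 2: PCA over the foreground patches only, rendered as RGB.
rgb = np.zeros((n_patches * n_patches, 3))
rgb[fg] = scaler.fit_transform(pca.fit_transform(features[fg]))
mask = (rgb.reshape(n_patches, n_patches, 3) * 255).astype(np.uint8)
cv2.imwrite("pca_mask.png", cv2.cvtColor(mask, cv2.COLOR_RGB2BGR))

On DINOv2 features the leading component usually splits the salient object from its background, which is why the single Threshold slider in the app is enough to control the mask.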