IbrahimHasani committed on
Commit b4b5272
1 Parent(s): 06c6341

Create app.py

Files changed (1): app.py (+146, -0)
app.py ADDED
@@ -0,0 +1,146 @@
import gradio as gr
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
from PIL import Image
import cv2

MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
CLIP_LEN = 32  # number of frames sampled per video clip

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

def get_video_length(file_path):
    """Return the total number of frames in the video."""
    cap = cv2.VideoCapture(file_path)
    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    return length

def read_video_opencv(file_path, indices):
    """Extract the frames at the given indices, converted to RGB."""
    frames = []
    failed_indices = []

    cap = cv2.VideoCapture(file_path)
    if not cap.isOpened():
        print(f"Error opening video file: {file_path}")
        return frames

    max_index = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
    for idx in indices:
        if idx <= max_index:
            frame = get_frame_with_opened_cap(cap, idx)
            if frame is not None:
                frames.append(frame)
            else:
                failed_indices.append(idx)
        else:
            failed_indices.append(idx)
    cap.release()

    if failed_indices:
        print(f"Failed to extract frames at indices: {failed_indices}")
    return frames

def get_frame_with_opened_cap(cap, index):
    """Seek to a frame index on an open capture; return an RGB array or None."""
    cap.set(cv2.CAP_PROP_POS_FRAMES, index)
    ret, frame = cap.read()
    if ret:
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return None

def sample_uniform_frame_indices(clip_len, seg_len):
    """Sample clip_len frame indices evenly; loop the video if it is shorter."""
    if seg_len < clip_len:
        repeat_factor = np.ceil(clip_len / seg_len).astype(int)
        indices = np.arange(seg_len).tolist() * repeat_factor
        indices = indices[:clip_len]
    else:
        spacing = seg_len // clip_len
        indices = [i * spacing for i in range(clip_len)]
    return np.array(indices).astype(np.int64)

def concatenate_frames(frames, clip_len):
    """Tile the sampled frames into a rows x cols grid image for display."""
    layout = {32: (4, 8)}
    rows, cols = layout[clip_len]
    combined_image = Image.new("RGB", (frames[0].shape[1] * cols, frames[0].shape[0] * rows))
    frame_iter = iter(frames)
    y_offset = 0
    for i in range(rows):
        x_offset = 0
        for j in range(cols):
            img = Image.fromarray(next(frame_iter))
            combined_image.paste(img, (x_offset, y_offset))
            x_offset += frames[0].shape[1]
        y_offset += frames[0].shape[0]
    return combined_image

def model_interface(uploaded_video, activity):
    video_length = get_video_length(uploaded_video)
    indices = sample_uniform_frame_indices(CLIP_LEN, seg_len=video_length)
    video = read_video_opencv(uploaded_video, indices)
    # Pad with the last frame if some indices failed to decode, so that
    # concatenate_frames and the processor always receive CLIP_LEN frames.
    while 0 < len(video) < CLIP_LEN:
        video.append(video[-1])
    concatenated_image = concatenate_frames(video, CLIP_LEN)

    # Zero-shot classification: score the user-supplied label against "other".
    activities_list = [activity, "other"]
    inputs = processor(
        text=activities_list,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )

    for key, value in inputs.items():
        if isinstance(value, torch.Tensor):
            inputs[key] = value.to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    results_probs = []
    results_logits = []
    max_prob_index = torch.argmax(probs[0]).item()
    for i, current_activity in enumerate(activities_list):
        prob = float(probs[0][i].cpu())
        logit = float(logits_per_video[0][i].cpu())
        results_probs.append((current_activity, f"Probability: {prob * 100:.2f}%"))
        results_logits.append((current_activity, f"Raw Score: {logit:.2f}"))

    likely_label = activities_list[max_prob_index]
    likely_probability = float(probs[0][max_prob_index].cpu()) * 100

    return (
        concatenated_image,
        results_probs,
        results_logits,
        f"{likely_label} ({likely_probability:.2f}%)",
    )

iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.Video(label="Upload a Video"),
        gr.Textbox(label="Activity to Detect"),
    ],
    outputs=[
        gr.Image(label="Concatenated Frames"),
        gr.Dataframe(headers=["Activity", "Probability"], label="Probabilities"),
        gr.Dataframe(headers=["Activity", "Raw Score"], label="Raw Scores"),
        gr.Textbox(label="Most Likely Activity"),
    ],
    title="Video Activity Classifier",
    description="""
    **Instructions:**

    1. **Upload a Video**: Select a video file to upload.
    2. **Enter an Activity Label**: Specify the activity you want to detect in the video.
    3. **View Results**:
        - The frames sampled from the video are displayed as a single grid image.
        - Probabilities and raw scores are shown for the specified activity and the "other" category.
        - The most likely activity detected in the video is displayed last.
    """,
)

if __name__ == "__main__":
    iface.launch()