IbrahimHasani committed on
Commit
b8466ce
1 Parent(s): 4ffad60

Update app.py

Files changed (1)
  1. app.py +34 -35
app.py CHANGED
@@ -1,21 +1,17 @@
 import gradio as gr
 import torch
+import numpy as np
+from transformers import AutoProcessor, AutoModel
+from PIL import Image
+from decord import VideoReader, cpu
+import cv2
 
 print(f"Is CUDA available: {torch.cuda.is_available()}")
 # True
 print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 # Tesla T4
 
-import numpy as np
-from transformers import AutoProcessor, AutoModel
-from PIL import Image
-from decord import VideoReader, gpu
-
 def sample_uniform_frame_indices(clip_len, seg_len):
-    """
-    Samples `clip_len` uniformly spaced frame indices from a video of length `seg_len`.
-    Handles edge cases where `seg_len` might be less than `clip_len`.
-    """
     if seg_len < clip_len:
         repeat_factor = np.ceil(clip_len / seg_len).astype(int)
         indices = np.arange(seg_len).tolist() * repeat_factor
@@ -23,24 +19,32 @@ def sample_uniform_frame_indices(clip_len, seg_len):
     else:
         spacing = seg_len // clip_len
         indices = [i * spacing for i in range(clip_len)]
-
     return np.array(indices).astype(np.int64)
 
 def read_video_decord(file_path, indices):
-    vr = VideoReader(file_path, num_threads=1, ctx=gpu(0))
+    vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
     video = vr.get_batch(indices).asnumpy()
     return video
 
+def read_video_opencv(file_path, indices):
+    vidcap = cv2.VideoCapture(file_path)
+    frames = []
+    for idx in indices:
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+        success, image = vidcap.read()
+        if success:
+            # Convert BGR to RGB
+            frames.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+    return np.array(frames)
+
+
 def concatenate_frames(frames, clip_len):
-    assert len(frames) == clip_len, f"The function expects {clip_len} frames as input."
-
     layout = {
         32: (4, 8),
         16: (4, 4),
         8: (2, 4)
     }
     rows, cols = layout[clip_len]
-
     combined_image = Image.new('RGB', (frames[0].shape[1]*cols, frames[0].shape[0]*rows))
     frame_iter = iter(frames)
     y_offset = 0
@@ -51,26 +55,22 @@ def concatenate_frames(frames, clip_len):
             combined_image.paste(img, (x_offset, y_offset))
             x_offset += frames[0].shape[1]
         y_offset += frames[0].shape[0]
-
     return combined_image
 
-
-def model_interface(uploaded_video, model_choice, activities):
+def model_interface(uploaded_video, model_choice, activity):
     clip_len = {
         "microsoft/xclip-base-patch16-zero-shot": 32,
         "microsoft/xclip-base-patch32-16-frames": 16,
         "microsoft/xclip-base-patch32": 8
    }.get(model_choice, 32)
-
    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
-    video = read_video_decord(uploaded_video, indices)
-    concatenated_image = concatenate_frames(video, clip_len)  # Passed clip_len as argument
+    video = read_video_opencv(uploaded_video, indices)
+    concatenated_image = concatenate_frames(video, clip_len)
 
+    # Appending "other" to the list of activities
+    activities_list = [activity, "other"]
     processor = AutoProcessor.from_pretrained(model_choice)
     model = AutoModel.from_pretrained(model_choice)
-    model = model.to("cuda")
-
-    activities_list = activities.split(",")
     inputs = processor(
         text=activities_list,
         videos=list(video),
@@ -86,19 +86,18 @@ def model_interface(uploaded_video, model_choice, activities):
 
     results_probs = []
     results_logits = []
+    max_prob_index = torch.argmax(probs[0]).item()
     for i in range(len(activities_list)):
-        activity = activities_list[i]
+        current_activity = activities_list[i]
         prob = float(probs[0][i])
         logit = float(logits_per_video[0][i])
-        results_probs.append((activity, f"Probability: {prob * 100:.2f}%"))
-        results_logits.append((activity, f"Raw Score: {logit:.2f}"))
+        results_probs.append((current_activity, f"Probability: {prob * 100:.2f}%"))
+        results_logits.append((current_activity, f"Raw Score: {logit:.2f}"))
 
-    # Retrieve most likely predicted label and its probability
-    max_prob_idx = probs[0].argmax().item()
-    most_likely_activity = activities_list[max_prob_idx]
-    most_likely_prob = float(probs[0][max_prob_idx])
+    likely_label = activities_list[max_prob_index]
+    likely_probability = float(probs[0][max_prob_index]) * 100
 
-    return concatenated_image, results_probs, results_logits, (most_likely_activity, f"Probability: {most_likely_prob * 100:.2f}%")
+    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]
 
 iface = gr.Interface(
     fn=model_interface,
@@ -109,15 +108,15 @@ iface = gr.Interface(
             "microsoft/xclip-base-patch32-16-frames",
             "microsoft/xclip-base-patch32"
         ], label="Model Choice"),
-        gr.components.Textbox(lines=4, label="Enter activities (comma-separated)"),
+        gr.components.Textbox(default="dancing", label="Desired Activity to Recognize"),
     ],
     outputs=[
-        gr.components.Image(type="pil", label="sampled frames"),
+        gr.components.Image(type="pil", label="Sampled Frames"),
         gr.components.Textbox(type="text", label="Probabilities"),
         gr.components.Textbox(type="text", label="Raw Scores"),
-        gr.components.Textbox(type="text", label="Most Likely Prediction")
+        gr.components.Textbox(type="text", label="Top Prediction")
     ],
     live=False
 )
 
-iface.launch()
+iface.launch()
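
For reference, a minimal self-contained sketch of the OpenCV frame-seeking pattern this commit switches to (replacing the decord GPU reader). The path "sample.mp4" and the 32-frame clip length are placeholders for illustration, not files or settings from the repository.

# Illustrative smoke test only; mirrors read_video_opencv and the uniform
# sampling used in app.py. "sample.mp4" is an assumed placeholder path.
import cv2
import numpy as np

video_path = "sample.mp4"
clip_len = 32  # frame count used for microsoft/xclip-base-patch16-zero-shot

cap = cv2.VideoCapture(video_path)
seg_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

# Uniformly spaced indices (same idea as sample_uniform_frame_indices,
# assuming the video has at least clip_len frames).
spacing = max(seg_len // clip_len, 1)
indices = [i * spacing for i in range(clip_len)]

frames = []
for idx in indices:
    cap.set(cv2.CAP_PROP_POS_FRAMES, idx)   # seek to the sampled frame
    ok, bgr = cap.read()
    if ok:
        frames.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))  # convert BGR to RGB
cap.release()

video = np.array(frames)  # (clip_len, H, W, 3) uint8, the layout fed to the X-CLIP processor
print(video.shape)

The BGR-to-RGB conversion matters because OpenCV decodes frames in BGR order while CLIP-style processors expect RGB input; seeking with CAP_PROP_POS_FRAMES also runs entirely on the CPU, with no decord or GPU dependency.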