IbrahimHasani committed on
Commit
a29b529
1 Parent(s): 5186ead

Update app.py

Files changed (1):
  app.py  +29 -27
app.py CHANGED
@@ -3,19 +3,32 @@ import torch
 import numpy as np
 from transformers import AutoProcessor, AutoModel
 from PIL import Image
-from decord import VideoReader, cpu, gpu
+import cv2
 
 MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
 CLIP_LEN = 32
 
-# Check for GPU availability
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print (device)
-
-# Load model and processor once and move them to the GPU
+# Load model and processor once
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
-model = AutoModel.from_pretrained(MODEL_NAME).to(device)
-model.eval()
+model = AutoModel.from_pretrained(MODEL_NAME)
+
+
+def get_video_length(file_path):
+    cap = cv2.VideoCapture(file_path)
+    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    cap.release()
+    return length
+
+def read_video_opencv(file_path, indices):
+    cap = cv2.VideoCapture(file_path)
+    frames = []
+    for i in indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        ret, frame = cap.read()
+        if ret:
+            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+    cap.release()
+    return frames
 
 def sample_uniform_frame_indices(clip_len, seg_len):
     if seg_len < clip_len:
@@ -27,12 +40,7 @@ def sample_uniform_frame_indices(clip_len, seg_len):
     indices = [i * spacing for i in range(clip_len)]
     return np.array(indices).astype(np.int64)
 
-def read_video_decord(file_path, indices):
-    # Use GPU for video decoding if available
-    vr_ctx = cpu(0)
-    vr = VideoReader(file_path, num_threads=1, ctx=vr_ctx)
-    video = vr.get_batch(indices).asnumpy()
-    return video
+
 
 def concatenate_frames(frames, clip_len):
     layout = { 32: (4, 8) }
@@ -50,24 +58,18 @@ def concatenate_frames(frames, clip_len):
     return combined_image
 
 def model_interface(uploaded_video, activity):
-    indices = sample_uniform_frame_indices(CLIP_LEN, seg_len=len(VideoReader(uploaded_video)))
-    video = read_video_decord(uploaded_video, indices)
+    video_length = get_video_length(uploaded_video)
+    indices = sample_uniform_frame_indices(CLIP_LEN, seg_len=video_length)
+    video = read_video_opencv(uploaded_video, indices)
     concatenated_image = concatenate_frames(video, CLIP_LEN)
 
     activities_list = [activity, "other"]
-
-    # Convert list of numpy.ndarrays to a single numpy.ndarray
-    video_array = np.array(video)
-
     inputs = processor(
         text=activities_list,
-        videos=video_array,
+        videos=list(video),
         return_tensors="pt",
         padding=True,
     )
-
-    # Move inputs to GPU
-    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
 
     with torch.no_grad():
         outputs = model(**inputs)
@@ -80,13 +82,13 @@ def model_interface(uploaded_video, activity):
     max_prob_index = torch.argmax(probs[0]).item()
     for i in range(len(activities_list)):
         current_activity = activities_list[i]
-        prob = float(probs[0][i].cpu())
-        logit = float(logits_per_video[0][i].cpu())
+        prob = float(probs[0][i])
+        logit = float(logits_per_video[0][i])
         results_probs.append((current_activity, f"Probability: {prob * 100:.2f}%"))
         results_logits.append((current_activity, f"Raw Score: {logit:.2f}"))
 
     likely_label = activities_list[max_prob_index]
-    likely_probability = float(probs[0][max_prob_index].cpu()) * 100
+    likely_probability = float(probs[0][max_prob_index]) * 100
 
     return concatenated_image, results_probs, results_logits, [ likely_label , likely_probability ]
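
For context, the updated helpers can be exercised end to end with a short script like the sketch below. This is a minimal smoke test, not part of the commit: it assumes app.py is importable from the working directory, and the video path (example.mp4) and activity label ("dancing") are placeholders.

# Hypothetical smoke test for the OpenCV-based pipeline in app.py (not part of this commit).
# Assumes app.py is importable; the video path and activity label below are placeholders.
from app import (
    CLIP_LEN,
    get_video_length,
    model_interface,
    read_video_opencv,
    sample_uniform_frame_indices,
)

video_path = "example.mp4"  # placeholder: any local video file

# Frame extraction, mirroring what model_interface now does internally.
seg_len = get_video_length(video_path)           # frame count via cv2.CAP_PROP_FRAME_COUNT
indices = sample_uniform_frame_indices(CLIP_LEN, seg_len=seg_len)
frames = read_video_opencv(video_path, indices)  # list of RGB numpy arrays
print(f"sampled {len(frames)} of {seg_len} frames")

# Full zero-shot classification against a single candidate activity.
image, probs, logits, (label, probability) = model_interface(video_path, "dancing")
print(label, f"{probability:.2f}%")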