cesar committed
Commit 64e21e1 · 1 Parent(s): 10dee64

Upload 3 files

Files changed (3)
  1. utils/constants.py +4 -0
  2. utils/custom_layers.py +67 -0
  3. utils/predict.py +104 -0
utils/constants.py ADDED
@@ -0,0 +1,4 @@
+ MAX_SEQ_LENGTH = 20  # maximum number of frames fed to the sequence model
+ NUM_FEATURES = 1024  # size of the DenseNet121 feature vector per frame
+ IMG_SIZE = 128  # frames are center-cropped to IMG_SIZE x IMG_SIZE
+ CLASS_VOCAB = ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']
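
These values must match the preprocessing the model was trained with: each video is reduced to at most MAX_SEQ_LENGTH frames, and each frame to a NUM_FEATURES-dimensional feature vector. A minimal sketch of the resulting input shape (illustrative only, not part of the commit):

import numpy as np
from utils.constants import MAX_SEQ_LENGTH, NUM_FEATURES

# One video becomes a single padded feature tensor for the sequence model.
frame_features = np.zeros((1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
assert frame_features.shape == (1, 20, 1024)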
utils/custom_layers.py ADDED
@@ -0,0 +1,67 @@
+ import tensorflow as tf
+ from tensorflow import keras
+ from tensorflow.keras import layers
+
+
+ class PositionalEmbedding(layers.Layer):
+     def __init__(self, sequence_length, output_dim, **kwargs):
+         super().__init__(**kwargs)
+         self.position_embeddings = layers.Embedding(
+             input_dim=sequence_length, output_dim=output_dim
+         )
+         self.sequence_length = sequence_length
+         self.output_dim = output_dim
+
+     def call(self, inputs):
+         # The inputs are of shape: `(batch_size, frames, num_features)`.
+         length = tf.shape(inputs)[1]
+         positions = tf.range(start=0, limit=length, delta=1)
+         embedded_positions = self.position_embeddings(positions)
+         return inputs + embedded_positions
+
+     def compute_mask(self, inputs, mask=None):
+         # Mask out padded (all-zero) frames.
+         mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
+         return mask
+
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             "sequence_length": self.sequence_length,
+             "output_dim": self.output_dim,
+         })
+         return config
+
+
+ class TransformerEncoder(layers.Layer):
+     def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
+         super().__init__(**kwargs)
+         self.embed_dim = embed_dim
+         self.dense_dim = dense_dim
+         self.num_heads = num_heads
+         self.attention = layers.MultiHeadAttention(
+             num_heads=num_heads, key_dim=embed_dim, dropout=0.3
+         )
+         self.dense_proj = keras.Sequential(
+             [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim)]
+         )
+         self.layernorm_1 = layers.LayerNormalization()
+         self.layernorm_2 = layers.LayerNormalization()
+
+     def call(self, inputs, mask=None):
+         if mask is not None:
+             # Expand to the (batch, target, source) shape MultiHeadAttention expects.
+             mask = mask[:, tf.newaxis, :]
+
+         attention_output = self.attention(inputs, inputs, attention_mask=mask)
+         proj_input = self.layernorm_1(inputs + attention_output)
+         proj_output = self.dense_proj(proj_input)
+         return self.layernorm_2(proj_input + proj_output)
+
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             "embed_dim": self.embed_dim,
+             "dense_dim": self.dense_dim,
+             "num_heads": self.num_heads,
+         })
+         return config
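
For context, the keras.io example these layers come from composes them into the video classifier roughly as sketched below; dense_dim=4 and num_heads=1 are assumptions carried over from that example, not values read from the uploaded checkpoint:

from tensorflow import keras
from tensorflow.keras import layers
from utils.constants import MAX_SEQ_LENGTH, NUM_FEATURES, CLASS_VOCAB
from utils.custom_layers import PositionalEmbedding, TransformerEncoder

def build_classifier(dense_dim=4, num_heads=1):
    # Sequence of per-frame features in, class probabilities out.
    inputs = keras.Input(shape=(None, NUM_FEATURES))
    x = PositionalEmbedding(MAX_SEQ_LENGTH, NUM_FEATURES)(inputs)
    x = TransformerEncoder(NUM_FEATURES, dense_dim, num_heads)(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(len(CLASS_VOCAB), activation="softmax")(x)
    return keras.Model(inputs, outputs)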
utils/predict.py ADDED
@@ -0,0 +1,104 @@
+ # from .custom_layers import TransformerEncoder, PositionalEmbedding
+ from .constants import MAX_SEQ_LENGTH, NUM_FEATURES, IMG_SIZE, CLASS_VOCAB
+ from huggingface_hub import from_pretrained_keras
+ from tensorflow import keras
+ from tensorflow.keras import layers
+ import numpy as np
+ import imageio
+ import cv2
+
+ # model = from_pretrained_keras("shivi/video-classification", custom_objects={"PositionalEmbedding": PositionalEmbedding, "TransformerEncoder": TransformerEncoder})
+
+ model = from_pretrained_keras("keras-io/video-transformers")
+
+ """
+ The code below is taken from the Video Classification with Transformers example on keras.io by Sayak Paul.
+ """
+ def build_feature_extractor():
+     # DenseNet121 with average pooling produces one NUM_FEATURES-dim vector per frame.
+     feature_extractor = keras.applications.DenseNet121(
+         weights="imagenet",
+         include_top=False,
+         pooling="avg",
+         input_shape=(IMG_SIZE, IMG_SIZE, 3),
+     )
+     preprocess_input = keras.applications.densenet.preprocess_input
+
+     inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
+     preprocessed = preprocess_input(inputs)
+
+     outputs = feature_extractor(preprocessed)
+     return keras.Model(inputs, outputs, name="feature_extractor")
+
+
+ feature_extractor = build_feature_extractor()
+
+ # Build the crop layer once and reuse it for every frame.
+ center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)
+
+
+ def crop_center(frame):
+     cropped = center_crop_layer(frame[None, ...])
+     cropped = cropped.numpy().squeeze()
+     return cropped
+
+
+ def load_video(path, max_frames=0):
+     cap = cv2.VideoCapture(path)
+     frames = []
+     try:
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+             frame = crop_center(frame)
+             frame = frame[:, :, [2, 1, 0]]  # BGR (OpenCV) -> RGB
+             frames.append(frame)
+
+             if len(frames) == max_frames:
+                 break
+     finally:
+         cap.release()
+     return np.array(frames)
+
+
+ def prepare_single_video(frames):
+     frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
+
+     # Pad shorter videos with all-zero frames.
+     if len(frames) < MAX_SEQ_LENGTH:
+         diff = MAX_SEQ_LENGTH - len(frames)
+         padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
+         frames = np.concatenate([frames, padding])
+
+     frames = frames[None, ...]
+
+     # Extract features from the frames of the current video.
+     for i, batch in enumerate(frames):
+         video_length = batch.shape[0]
+         length = min(MAX_SEQ_LENGTH, video_length)
+         for j in range(length):
+             if np.mean(batch[j, :]) > 0.0:
+                 frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
+             else:
+                 # Padded frames get zero features; PositionalEmbedding.compute_mask masks them out.
+                 frame_features[i, j, :] = 0.0
+
+     return frame_features
+
+
+ def predict_action(path):
+     frames = load_video(path)
+     frame_features = prepare_single_video(frames)
+     probabilities = model.predict(frame_features)[0]
+     confidences = {}
+
+     # Map probabilities to class labels, most confident first.
+     for i in np.argsort(probabilities)[::-1]:
+         confidences[CLASS_VOCAB[i]] = float(probabilities[i])
+
+     gif_out = to_gif(frames[:MAX_SEQ_LENGTH])
+
+     print(confidences)
+     return confidences, gif_out
+
+
+ def to_gif(images):
+     converted_images = images.astype(np.uint8)
+     imageio.mimsave("animation.gif", converted_images, fps=10)
+     return "animation.gif"
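
A minimal usage sketch for this module ("sample_video.avi" is a hypothetical path):

from utils.predict import predict_action

# Runs the whole pipeline: decode frames, extract DenseNet features, classify.
confidences, gif_path = predict_action("sample_video.avi")
print(max(confidences, key=confidences.get), gif_path)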