Upload 3 files
- utils/constants.py +4 -0
- utils/custom_layers.py +67 -0
- utils/predict.py +104 -0
utils/constants.py
ADDED
@@ -0,0 +1,4 @@
+MAX_SEQ_LENGTH = 20
+NUM_FEATURES = 1024
+IMG_SIZE = 128
+CLASS_VOCAB = ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']
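These four values fix the clip geometry the rest of the upload assumes: frames are cropped to IMG_SIZE x IMG_SIZE, at most MAX_SEQ_LENGTH frames per clip are kept, and each frame is reduced to a NUM_FEATURES-dimensional DenseNet feature vector before classification over CLASS_VOCAB. As a quick illustrative check (this snippet is not part of the upload), the per-clip feature tensor that utils/predict.py builds from them looks like this:

import numpy as np

from utils.constants import MAX_SEQ_LENGTH, NUM_FEATURES

# Shape of the single-clip feature batch that prepare_single_video() fills in.
frame_features = np.zeros((1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
assert frame_features.shape == (1, 20, 1024)  # one clip, 20 frames, 1024-d features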
utils/custom_layers.py
ADDED
@@ -0,0 +1,67 @@
+import tensorflow as tf
+from tensorflow import keras
+from keras import layers
+
+
+class PositionalEmbedding(layers.Layer):
+    def __init__(self, sequence_length, output_dim, **kwargs):
+        super().__init__(**kwargs)
+        self.position_embeddings = layers.Embedding(
+            input_dim=sequence_length, output_dim=output_dim
+        )
+        self.sequence_length = sequence_length
+        self.output_dim = output_dim
+
+    def call(self, inputs):
+        # The inputs are of shape: `(batch_size, frames, num_features)`
+        length = tf.shape(inputs)[1]
+        positions = tf.range(start=0, limit=length, delta=1)
+        embedded_positions = self.position_embeddings(positions)
+        return inputs + embedded_positions
+
+    def compute_mask(self, inputs, mask=None):
+        mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
+        return mask
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            "sequence_length": self.sequence_length,
+            "output_dim": self.output_dim,
+        })
+        return config
+
+
+class TransformerEncoder(layers.Layer):
+    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+        self.dense_dim = dense_dim
+        self.num_heads = num_heads
+        self.attention = layers.MultiHeadAttention(
+            num_heads=num_heads, key_dim=embed_dim, dropout=0.3
+        )
+        self.dense_proj = keras.Sequential(
+            [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim)]
+        )
+        self.layernorm_1 = layers.LayerNormalization()
+        self.layernorm_2 = layers.LayerNormalization()
+
+    def call(self, inputs, mask=None):
+        if mask is not None:
+            mask = mask[:, tf.newaxis, :]
+
+        attention_output = self.attention(inputs, inputs, attention_mask=mask)
+        proj_input = self.layernorm_1(inputs + attention_output)
+        proj_output = self.dense_proj(proj_input)
+        return self.layernorm_2(proj_input + proj_output)
+
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            "embed_dim": self.embed_dim,
+            "dense_dim": self.dense_dim,
+            "num_heads": self.num_heads,
+        })
+        return config
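These two layers are the custom objects that the commented-out from_pretrained_keras call in utils/predict.py would pass via custom_objects, which is why both define get_config for serialization. As a rough guide to how they fit together, here is a minimal sketch of a classifier head in the style of the keras.io Video Transformers example this Space builds on; the fixed input shape and the hyperparameters dense_dim=4 and num_heads=1 are assumptions for illustration, not values taken from this upload:

from tensorflow import keras
from keras import layers

from utils.constants import MAX_SEQ_LENGTH, NUM_FEATURES, CLASS_VOCAB
from utils.custom_layers import PositionalEmbedding, TransformerEncoder


def build_video_classifier(dense_dim=4, num_heads=1):
    # One clip = MAX_SEQ_LENGTH frames, each already reduced to a NUM_FEATURES vector.
    inputs = keras.Input(shape=(MAX_SEQ_LENGTH, NUM_FEATURES))
    x = PositionalEmbedding(MAX_SEQ_LENGTH, NUM_FEATURES, name="frame_position_embedding")(inputs)
    x = TransformerEncoder(NUM_FEATURES, dense_dim, num_heads, name="transformer_layer")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(len(CLASS_VOCAB), activation="softmax")(x)
    return keras.Model(inputs, outputs)

The pretrained checkpoint pulled in utils/predict.py already contains such a head, so this sketch is only meant to show the role of PositionalEmbedding (adds learned per-frame position vectors and emits a padding mask) and TransformerEncoder (masked self-attention plus a feed-forward projection with residual connections).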
utils/predict.py
ADDED
@@ -0,0 +1,104 @@
+#from .custom_layers import TransformerEncoder, PositionalEmbedding
+from .constants import MAX_SEQ_LENGTH, NUM_FEATURES, IMG_SIZE, CLASS_VOCAB
+from huggingface_hub import from_pretrained_keras
+from tensorflow import keras
+from keras import layers
+import numpy as np
+import imageio
+import cv2
+
+#model = from_pretrained_keras("shivi/video-classification",custom_objects={"PositionalEmbedding":PositionalEmbedding,"TransformerEncoder": TransformerEncoder})
+
+model = from_pretrained_keras("keras-io/video-transformers")
+
+"""
+The code below is taken from the Video Transformers example on keras.io by Sayak Paul.
+"""
+def build_feature_extractor():
+    feature_extractor = keras.applications.DenseNet121(
+        weights="imagenet",
+        include_top=False,
+        pooling="avg",
+        input_shape=(IMG_SIZE, IMG_SIZE, 3),
+    )
+    preprocess_input = keras.applications.densenet.preprocess_input
+
+    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
+    preprocessed = preprocess_input(inputs)
+
+    outputs = feature_extractor(preprocessed)
+    return keras.Model(inputs, outputs, name="feature_extractor")
+
+
+feature_extractor = build_feature_extractor()
+
+
+def crop_center(frame):
+    center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)
+    cropped = center_crop_layer(frame[None, ...])
+    cropped = cropped.numpy().squeeze()
+    return cropped
+
+
+def load_video(path, max_frames=0):
+    cap = cv2.VideoCapture(path)
+    frames = []
+    try:
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frame = crop_center(frame)
+            frame = frame[:, :, [2, 1, 0]]  # BGR -> RGB
+            frames.append(frame)
+
+            if len(frames) == max_frames:
+                break
+    finally:
+        cap.release()
+    return np.array(frames)
+
+
+def prepare_single_video(frames):
+    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
+
+    # Pad shorter videos with all-zero frames.
+    if len(frames) < MAX_SEQ_LENGTH:
+        diff = MAX_SEQ_LENGTH - len(frames)
+        padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
+        frames = np.concatenate([frames, padding])
+
+    frames = frames[None, ...]
+
+    # Extract features from the frames of the current video.
+    for i, batch in enumerate(frames):
+        video_length = batch.shape[0]
+        length = min(MAX_SEQ_LENGTH, video_length)
+        for j in range(length):
+            if np.mean(batch[j, :]) > 0.0:
+                frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
+            else:
+                frame_features[i, j, :] = 0.0
+
+    return frame_features
+
+
+def predict_action(path):
+    frames = load_video(path)
+    frame_features = prepare_single_video(frames)
+    probabilities = model.predict(frame_features)[0]
+    confidences = {}
+
+    for i in np.argsort(probabilities)[::-1]:
+        confidences[CLASS_VOCAB[i]] = float(probabilities[i])
+
+    gif_out = to_gif(frames[:MAX_SEQ_LENGTH])
+
+    print(confidences)
+    return confidences, gif_out
+
+
+def to_gif(images):
+    converted_images = images.astype(np.uint8)
+    imageio.mimsave("animation.gif", converted_images, fps=10)
+    return "animation.gif"
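This commit does not include the Space's app entry point, so how predict_action is exposed to the UI is not shown here. Below is a minimal sketch of the kind of Gradio wiring that typically sits on top of it; the file name app.py and every component choice are assumptions for illustration, not part of this upload:

# app.py -- hypothetical entry point, not part of this commit.
import gradio as gr

from utils.predict import predict_action


def classify(video_path):
    # predict_action returns ({class_name: probability}, path_to_gif).
    confidences, gif_path = predict_action(video_path)
    return confidences, gif_path


demo = gr.Interface(
    fn=classify,
    inputs=gr.Video(label="Input clip"),
    outputs=[
        gr.Label(num_top_classes=5, label="Predicted action"),
        gr.Image(label="Sampled frames"),
    ],
    title="Video Transformers action recognition",
)

if __name__ == "__main__":
    demo.launch()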