JackWong0911's picture
Update app.py
b035eea verified
raw
history blame contribute delete
No virus
26.1 kB
import cv2
import gradio as gr
import imutils
import numpy as np
import torch
from pytorchvideo.transforms import (
ApplyTransformToKey,
Normalize,
RandomShortSideScale,
RemoveKey,
ShortSideScale,
UniformTemporalSubsample,
)
from torchvision.transforms import (
Compose,
Lambda,
RandomCrop,
RandomHorizontalFlip,
Resize,
)
# my code below
# import transformers.models.timesformer.modeling_timesformer
from transformers.models.timesformer.modeling_timesformer import TimeSformerDropPath, TimeSformerAttention, TimesformerIntermediate, TimesformerOutput, TimesformerLayer, TimesformerEncoder, TimesformerModel, TIMESFORMER_INPUTS_DOCSTRING, _CONFIG_FOR_DOC, TimesformerEmbeddings, TimesformerForVideoClassification
from transformers import TimesformerConfig
configuration = TimesformerConfig()
import collections
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutput, ImageClassifierOutput
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from transformers.models.timesformer.configuration_timesformer import TimesformerConfig
class MyTimesformerLayer(TimesformerLayer):
    """TimeSformer encoder layer whose divided space-time path runs spatial attention first.

    For ``attention_type == "divided_space_time"`` the layer applies attention
    over the patches of each frame, then attention over the frames of each patch
    position, then the MLP. For ``"space_only"`` and ``"joint_space_time"`` a
    single attention over the whole token sequence is followed by the MLP.

    Args:
        config: Model configuration. Read for ``attention_type``,
            ``drop_path_rate``, ``num_hidden_layers``, ``hidden_size``,
            ``layer_norm_eps``, ``num_frames``, ``image_size`` and ``patch_size``.
        layer_index: Position of this layer inside the encoder; selects this
            layer's stochastic-depth rate.

    Raises:
        ValueError: If ``config.attention_type`` is not a supported value.
    """

    def __init__(self, config: TimesformerConfig, layer_index: int) -> None:
        # Every sub-module is built below, so only the bare ``nn.Module``
        # machinery is initialised here. The previous zero-argument
        # ``super().__init__()`` raised ``TypeError`` because the parent
        # constructor requires ``config`` and ``layer_index``.
        nn.Module.__init__(self)

        attention_type = config.attention_type
        if attention_type not in ["divided_space_time", "space_only", "joint_space_time"]:
            raise ValueError("Unknown attention type: {}".format(attention_type))

        # Stochastic depth decay rule: the drop rate grows linearly with depth.
        drop_path_rates = [
            x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)
        ]
        drop_path_rate = drop_path_rates[layer_index]

        self.drop_path = TimeSformerDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.attention = TimeSformerAttention(config)
        self.intermediate = TimesformerIntermediate(config)
        self.output = TimesformerOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.config = config
        self.attention_type = attention_type

        # Temporal attention parameters exist only for the divided variant.
        if self.attention_type == "divided_space_time":
            self.temporal_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
            self.temporal_attention = TimeSformerAttention(config)
            self.temporal_dense = nn.Linear(config.hidden_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False):
        """Run the layer on a token sequence.

        Args:
            hidden_states: Tensor indexed as ``(batch, tokens, hidden)`` where
                token 0 is the CLS token and the remaining tokens are the patch
                tokens of all frames.
            output_attentions: Forwarded to the (spatial / joint) attention module.

        Returns:
            A tuple whose first element is the layer output, followed by whatever
            extra outputs the (spatial / joint) attention module returned.
        """
        num_frames = self.config.num_frames
        num_patch_width = self.config.image_size // self.config.patch_size
        batch_size = hidden_states.shape[0]
        num_spatial_tokens = (hidden_states.size(1) - 1) // num_frames
        num_patch_height = num_spatial_tokens // num_patch_width

        if self.attention_type in ["space_only", "joint_space_time"]:
            self_attention_outputs = self.attention(
                self.layernorm_before(hidden_states), output_attentions=output_attentions
            )
            attention_output = self_attention_outputs[0]
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

            hidden_states = hidden_states + self.drop_path(attention_output)

            layer_output = self.layernorm_after(hidden_states)
            layer_output = self.intermediate(layer_output)
            layer_output = self.output(layer_output)
            layer_output = hidden_states + self.drop_path(layer_output)

            outputs = (layer_output,) + outputs
            return outputs

        elif self.attention_type == "divided_space_time":
            hidden_size = hidden_states.shape[2]

            # ---- Spatial attention: one sequence per (batch, frame) ----
            init_cls_token = hidden_states[:, 0, :].unsqueeze(1)
            # Give every frame its own copy of the CLS token.
            cls_token = init_cls_token.repeat(1, num_frames, 1)
            cls_token = cls_token.reshape(batch_size * num_frames, 1, cls_token.shape[2])
            spatial_embedding = hidden_states[:, 1:, :]
            spatial_embedding = (
                spatial_embedding.reshape(batch_size, num_patch_height, num_patch_width, num_frames, hidden_size)
                .permute(0, 3, 1, 2, 4)
                .reshape(batch_size * num_frames, num_patch_height * num_patch_width, hidden_size)
            )
            spatial_embedding = torch.cat((cls_token, spatial_embedding), 1)

            spatial_attention_outputs = self.attention(
                self.layernorm_before(spatial_embedding), output_attentions=output_attentions
            )
            attention_output = spatial_attention_outputs[0]
            outputs = spatial_attention_outputs[1:]  # add self attentions if we output attention weights

            residual_spatial = self.drop_path(attention_output)

            # Taking care of the CLS token: average its per-frame copies.
            cls_token = residual_spatial[:, 0, :]
            cls_token = cls_token.reshape(batch_size, num_frames, cls_token.shape[1])
            cls_token = torch.mean(cls_token, 1, True)
            residual_spatial = residual_spatial[:, 1:, :]
            # Undo the per-frame grouping so tokens line up with the input order.
            residual_spatial = (
                residual_spatial.reshape(batch_size, num_frames, num_patch_height, num_patch_width, hidden_size)
                .permute(0, 2, 3, 1, 4)
                .reshape(batch_size, num_patch_height * num_patch_width * num_frames, hidden_size)
            )
            patch_tokens = hidden_states[:, 1:, :] + residual_spatial

            # ---- Temporal attention: one sequence per (batch, patch position) ----
            temporal_embedding = patch_tokens.reshape(
                batch_size, num_patch_height, num_patch_width, num_frames, hidden_size
            ).reshape(batch_size * num_patch_height * num_patch_width, num_frames, hidden_size)

            temporal_attention_outputs = self.temporal_attention(
                self.temporal_layernorm(temporal_embedding),
            )
            attention_output = temporal_attention_outputs[0]

            residual_temporal = self.drop_path(attention_output)
            residual_temporal = residual_temporal.reshape(
                batch_size, num_patch_height, num_patch_width, num_frames, hidden_size
            ).reshape(batch_size, num_patch_height * num_patch_width * num_frames, hidden_size)
            residual_temporal = self.temporal_dense(residual_temporal)
            patch_tokens = patch_tokens + residual_temporal

            # ---- MLP ----
            # NOTE(review): ``residual_temporal`` has already been added to
            # ``patch_tokens`` above and is added a second time here. The math is
            # left exactly as it was because a checkpoint may have been trained
            # with it -- confirm whether the double residual is intended.
            hidden_states = torch.cat((init_cls_token, patch_tokens), 1) + torch.cat(
                (cls_token, residual_temporal), 1
            )
            layer_output = self.layernorm_after(hidden_states)
            layer_output = self.intermediate(layer_output)
            layer_output = self.output(layer_output)
            layer_output = hidden_states + self.drop_path(layer_output)

            outputs = (layer_output,) + outputs
            return outputs
import transformers.models.timesformer.modeling_timesformer
class MyTimesformerEncoder(TimesformerEncoder):
    """Stack of ``MyTimesformerLayer`` modules applied one after another.

    Args:
        config: Model configuration; ``num_hidden_layers`` sets the stack depth
            and the same object is handed to every layer.
    """

    def __init__(self, config: TimesformerConfig) -> None:
        # The layer stack is rebuilt below from ``MyTimesformerLayer``, so only
        # the bare ``nn.Module`` machinery is initialised here. The previous
        # zero-argument ``super().__init__()`` raised ``TypeError`` because the
        # parent constructor requires ``config``.
        nn.Module.__init__(self)
        self.config = config
        self.layer = nn.ModuleList([MyTimesformerLayer(config, ind) for ind in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        """Pass ``hidden_states`` through every layer in order.

        Args:
            hidden_states: Input to the first layer.
            output_attentions: When true, collect element 1 of each layer's output.
            output_hidden_states: When true, collect the input of every layer
                plus the final output.
            return_dict: When false, return a plain tuple of the non-``None``
                results instead of a ``BaseModelOutput``.

        Returns:
            ``BaseModelOutput`` (or a tuple) holding the last hidden state and the
            optionally collected hidden states and attentions.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for layer_module in self.layer:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
class MyTimesformerModel(TimesformerModel):
    """Bare TimeSformer backbone: embeddings, encoder and a final layer norm."""

    def __init__(self, config: configuration):
        super().__init__(config)
        self.config = config

        self.embeddings = TimesformerEmbeddings(config)
        # NOTE(review): this is the stock ``TimesformerEncoder``, not the
        # ``MyTimesformerEncoder`` defined in this file -- confirm which one the
        # checkpoint was trained with before changing it.
        self.encoder = TimesformerEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the patch-embedding module of the embeddings block."""
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_idx, head_ids in heads_to_prune.items():
            self.encoder.layer[layer_idx].attention.prune_heads(head_ids)

    @add_start_docstrings_to_model_forward(TIMESFORMER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> import av
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, TimesformerModel
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`List[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 8 frames
        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k400")

        >>> # prepare video for the model
        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 1569, 768]
        ```"""
        # Fall back to the configuration for every flag the caller left unset.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        embedded = self.embeddings(pixel_values)
        encoder_outputs = self.encoder(
            embedded,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        if self.layernorm is not None:
            sequence_output = self.layernorm(sequence_output)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=sequence_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )
        # Tuple form: normalised sequence first, then the encoder's extras.
        return (sequence_output,) + encoder_outputs[1:]
class MyTimesformerForVideoClassification(TimesformerForVideoClassification):
    """``MyTimesformerModel`` backbone with a linear classification head on the first token."""

    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.timesformer = MyTimesformerModel(config)

        # Classifier head
        if config.num_labels > 0:
            self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        else:
            self.classifier = nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(TIMESFORMER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import av
        >>> import torch
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, TimesformerForVideoClassification
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`List[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 8 frames
        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
        >>> model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")

        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ...     logits = outputs.logits

        >>> # model predicts one of the 400 Kinetics-400 classes
        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])
        eating spaghetti
        ```"""
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.timesformer(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The head reads only the first token of the backbone's sequence output.
        first_token = outputs[0][:, 0]
        logits = self.classifier(first_token)

        loss = None
        if labels is not None:
            # Infer the problem type once and remember it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return ImageClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: logits, then the backbone's extras, with the loss in front when present.
        output = (logits,) + outputs[1:]
        return output if loss is None else (loss,) + output
from transformers import AutoImageProcessor
# Hub id of the fine-tuned checkpoint served by this demo.
MODEL_CKPT = "JackWong0911/timesformer-base-finetuned-k400-kinetic400-subset-epoch6real-num_frame_10_myViT2_more_data"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL = MyTimesformerForVideoClassification.from_pretrained(MODEL_CKPT).to(DEVICE)
# The processor is only consulted for the target resolution below.
PROCESSOR = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
RESIZE_TO = PROCESSOR.size["shortest_edge"]
NUM_FRAMES_TO_SAMPLE = MODEL.config.num_frames
IMAGE_STATS = {"image_mean": [0.485, 0.456, 0.406], "image_std": [0.229, 0.224, 0.225]}
# Inference-time preprocessing: pick NUM_FRAMES_TO_SAMPLE frames, scale pixel
# values by 1/255, normalise with IMAGE_STATS, then resize to a square.
VAL_TRANSFORMS = Compose(
    [
        UniformTemporalSubsample(NUM_FRAMES_TO_SAMPLE),
        Lambda(lambda x: x / 255.0),
        Normalize(IMAGE_STATS["image_mean"], IMAGE_STATS["image_std"]),
        Resize((RESIZE_TO, RESIZE_TO)),
    ]
)
# Class names ordered by class id, so that LABELS[i] names logit i. Relying on
# the dict's key order instead would mislabel predictions whenever label2id is
# not stored in id order.
LABELS = sorted(MODEL.config.label2id, key=MODEL.config.label2id.get)
def parse_video(video_file):
    """Decode every frame of a video file into RGB arrays.

    Reference: https://pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv/

    Args:
        video_file: Value handed to ``cv2.VideoCapture`` (e.g. a file path).

    Returns:
        A list with one RGB array per decoded frame, in stream order. The list
        is empty when no frame could be read.
    """
    vs = cv2.VideoCapture(video_file)
    try:
        # Best-effort report of the frame count; decoding proceeds either way.
        try:
            prop = (
                cv2.cv.CV_CAP_PROP_FRAME_COUNT
                if imutils.is_cv2()
                else cv2.CAP_PROP_FRAME_COUNT
            )
            total = int(vs.get(prop))
            print("[INFO] {} total frames in video".format(total))
        except Exception:
            print("[INFO] could not determine # of frames in video")
            print("[INFO] no approx. completion time can be provided")

        frames = []
        # Loop over frames from the video file stream.
        while True:
            grabbed, frame = vs.read()
            if frame is not None:
                # OpenCV decodes to BGR; convert to RGB before collecting.
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # If the frame was not grabbed, we have reached the end of the stream.
            if not grabbed:
                break
        return frames
    finally:
        # Always free the capture handle, even if decoding raised.
        vs.release()
def preprocess_video(frames: list):
    """Turn a list of decoded frames into a batched model input on ``DEVICE``.

    Args:
        frames: Non-empty list of arrays, each (height, width, num_channels),
            all with the same shape.

    Returns:
        Tensor of shape (1, num_frames, num_channels, height, width) after
        ``VAL_TRANSFORMS``, moved to ``DEVICE``.

    Raises:
        ValueError: If ``frames`` is empty (e.g. the video could not be decoded).
    """
    if not frames:
        raise ValueError("No frames could be decoded from the input video.")

    # Stacked, the frames have shape (num_frames, height, width, num_channels).
    # ``from_numpy`` reuses the stacked buffer instead of copying it again.
    video_tensor = torch.from_numpy(np.stack(frames))

    # The preprocessing chain expects (num_channels, num_frames, height, width).
    video_tensor = video_tensor.permute(3, 0, 1, 2)
    video_tensor_pp = VAL_TRANSFORMS(video_tensor)

    # The model expects (num_frames, num_channels, height, width) ...
    video_tensor_pp = video_tensor_pp.permute(1, 0, 2, 3)
    # ... with a leading batch dimension.
    video_tensor_pp = video_tensor_pp.unsqueeze(0)
    return video_tensor_pp.to(DEVICE)
def infer(video_file):
    """Classify a video and return a ``{label: probability}`` mapping.

    The video is decoded with ``parse_video``, preprocessed with
    ``preprocess_video``, run through ``MODEL`` without gradients, and the
    logits are converted to softmax scores keyed by the entries of ``LABELS``.
    """
    video_tensor = preprocess_video(parse_video(video_file))

    # forward pass
    with torch.no_grad():
        logits = MODEL(pixel_values=video_tensor).logits

    softmax_scores = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)
    return {label: float(softmax_scores[idx]) for idx, label in enumerate(LABELS)}
# Build the demo UI and start serving it.
# Compared with the earlier call:
#   * ``gr.Video`` is created without ``type="file"`` and ``allow_screenshot``
#     is dropped -- neither is an option in current Gradio releases.
#   * flagging is disabled with the string ``"never"`` rather than ``False``.
#   * ``share=True`` is passed to ``launch()``, which is where Gradio reads it.
# NOTE(review): verify against the Gradio version pinned for this Space.
demo = gr.Interface(
    fn=infer,
    inputs=gr.Video(),
    outputs=gr.Label(num_top_classes=3),
    examples=[
        ["examples/archery.mp4"],
        ["examples/bowling.mp4"],
        ["examples/flying_kite.mp4"],
        ["examples/high_jump.mp4"],
        ["examples/marching.mp4"],
    ],
    title="MyViT fine-tuned on a subset of Kinetics400",
    description=(
        "Gradio demo for MyViT for video classification. To use it, simply upload your video or click one of the"
        " examples to load them. Read more at the links below."
    ),
    article=(
        "<div style='text-align: center;'><p>MyViT</p>"
        " <center><a href='https://huggingface.co/JackWong0911/timesformer-base-finetuned-k400-kinetic400-subset-epoch6real-num_frame_10_myViT2_more_data' target='_blank'>Fine-tuned Model</a></center></div>"
    ),
    allow_flagging="never",
)
demo.launch(share=True)