Spaces:

IbrahimHasani
/

ActionDetectionVideo

Runtime error

App Files Files Community

ActionDetectionVideo / app.py

IbrahimHasani

Update app.py

b8466ce about 1 year ago

raw

history blame

4.21 kB

	import gradio as gr
	import torch
	import numpy as np
	from transformers import AutoProcessor, AutoModel
	from PIL import Image
	from decord import VideoReader, cpu
	import cv2

	# Report the accelerator once at startup. The device name is only queried
	# when CUDA is present: torch.cuda.current_device() raises on CPU-only hosts,
	# which would abort the app before the interface is ever built.
	cuda_available = torch.cuda.is_available()
	print(f"Is CUDA available: {cuda_available}")
	if cuda_available:
	    # e.g. "Tesla T4"
	    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
	else:
	    print("CUDA device: none (running on CPU)")

	def sample_uniform_frame_indices(clip_len, seg_len):
	    """Choose ``clip_len`` frame indices for a video that has ``seg_len`` frames.

	    When the video is shorter than the clip, the indices cycle through
	    ``0 .. seg_len - 1`` repeatedly until ``clip_len`` entries are produced.
	    Otherwise the indices start at 0 and advance by the integer stride
	    ``seg_len // clip_len``; because the stride is floored, frames near the
	    end of the video may never be selected.

	    Args:
	        clip_len: Number of indices to return; must be positive.
	        seg_len: Number of frames available in the video; must be positive.

	    Returns:
	        A 1-D ``np.int64`` array of length ``clip_len``.

	    Raises:
	        ValueError: If ``clip_len`` or ``seg_len`` is not positive.
	    """
	    # Fail with a clear message instead of a ZeroDivisionError further down.
	    if clip_len <= 0:
	        raise ValueError(f"clip_len must be positive, got {clip_len}")
	    if seg_len <= 0:
	        raise ValueError(f"seg_len must be positive, got {seg_len}")

	    positions = np.arange(clip_len, dtype=np.int64)
	    if seg_len < clip_len:
	        # Wrap around the short video: 0, 1, ..., seg_len - 1, 0, 1, ...
	        return positions % seg_len
	    return positions * (seg_len // clip_len)

	def read_video_decord(file_path, indices):
	    """Fetch the frames at ``indices`` from ``file_path`` with decord.

	    Opens the file with a single-threaded CPU ``VideoReader`` and returns
	    the requested batch converted via ``asnumpy()``.
	    """
	    reader = VideoReader(file_path, num_threads=1, ctx=cpu(0))
	    batch = reader.get_batch(indices)
	    return batch.asnumpy()

	def read_video_opencv(file_path, indices):
	    """Read the frames at ``indices`` from ``file_path`` with OpenCV.

	    Each index is seeked to explicitly before reading. Frames that OpenCV
	    fails to read are skipped, so the result may hold fewer frames than
	    ``indices`` has entries.

	    Args:
	        file_path: Path of the video file to open.
	        indices: Iterable of frame positions to read.

	    Returns:
	        ``np.array`` built from the successfully read frames, each converted
	        from BGR to RGB.
	    """
	    vidcap = cv2.VideoCapture(file_path)
	    frames = []
	    try:
	        for idx in indices:
	            vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)
	            success, image = vidcap.read()
	            if success:
	                # Convert BGR to RGB
	                frames.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
	    finally:
	        # Always free the capture handle, even if a read or conversion raises.
	        vidcap.release()
	    return np.array(frames)


	def concatenate_frames(frames, clip_len):
	    """Tile the sampled frames into a single contact-sheet image.

	    The grid shape is chosen from ``clip_len`` (32 -> 4x8, 16 -> 4x4,
	    8 -> 2x4) and every tile has the size of the first frame. Frames are
	    placed row by row, left to right. If fewer frames than grid cells are
	    supplied, the remaining cells keep the fill of the freshly created
	    image; frames beyond the grid capacity are ignored.

	    Args:
	        frames: Sequence (or array) of H x W x 3 RGB frames.
	        clip_len: Number of frames the grid is laid out for; must be one of
	            32, 16 or 8.

	    Returns:
	        A ``PIL.Image`` in RGB mode containing the tiled frames.

	    Raises:
	        KeyError: If ``clip_len`` has no layout entry.
	        ValueError: If ``frames`` is empty.
	    """
	    layout = {
	        32: (4, 8),
	        16: (4, 4),
	        8: (2, 4),
	    }
	    rows, cols = layout[clip_len]
	    if len(frames) == 0:
	        raise ValueError("no frames to concatenate")

	    tile_height, tile_width = frames[0].shape[:2]
	    combined_image = Image.new('RGB', (tile_width * cols, tile_height * rows))

	    # Bound the loop by the frames actually available so a short decode
	    # yields a partially filled sheet instead of a StopIteration.
	    for position, frame in enumerate(frames[:rows * cols]):
	        row, col = divmod(position, cols)
	        combined_image.paste(
	            Image.fromarray(frame),
	            (col * tile_width, row * tile_height),
	        )
	    return combined_image

	def model_interface(uploaded_video, model_choice, activity):
	    """Score how well ``activity`` describes the uploaded video.

	    Samples a fixed number of frames from the video (the count depends on
	    the chosen checkpoint), runs the checkpoint on those frames against the
	    two text labels ``[activity, "other"]``, and reports the softmax
	    probabilities and raw logits for both labels.

	    Args:
	        uploaded_video: Path of the uploaded video file.
	        model_choice: Checkpoint name; when empty or ``None`` the zero-shot
	            patch16 checkpoint is used.
	        activity: Text description of the activity to look for.

	    Returns:
	        A 4-tuple ``(sampled_frames_image, probabilities, raw_scores, top)``
	        where ``probabilities`` and ``raw_scores`` are lists of
	        ``(label, formatted_text)`` pairs and ``top`` is
	        ``[label, probability_percent]`` for the highest-probability label.

	    Raises:
	        ValueError: If no video was supplied or too few frames were decoded.
	    """
	    frames_per_model = {
	        "microsoft/xclip-base-patch16-zero-shot": 32,
	        "microsoft/xclip-base-patch32-16-frames": 16,
	        "microsoft/xclip-base-patch32": 8,
	    }
	    if not uploaded_video:
	        raise ValueError("No video was provided.")
	    if not model_choice:
	        # Keep the checkpoint consistent with the 32-frame fallback below;
	        # otherwise from_pretrained would be handed None.
	        model_choice = "microsoft/xclip-base-patch16-zero-shot"
	    clip_len = frames_per_model.get(model_choice, 32)

	    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
	    video = read_video_opencv(uploaded_video, indices)
	    if len(video) < clip_len:
	        # read_video_opencv skips frames it cannot read.
	        raise ValueError(
	            f"Decoded {len(video)} of the {clip_len} frames required by {model_choice}."
	        )
	    concatenated_image = concatenate_frames(video, clip_len)

	    # "other" is added as a second label so the softmax has something to
	    # compare the requested activity against.
	    activities_list = [activity, "other"]
	    # NOTE(review): the processor and model are loaded on every call;
	    # consider caching them per checkpoint if memory allows.
	    processor = AutoProcessor.from_pretrained(model_choice)
	    model = AutoModel.from_pretrained(model_choice)
	    inputs = processor(
	        text=activities_list,
	        videos=list(video),
	        return_tensors="pt",
	        padding=True,
	    )

	    with torch.no_grad():
	        outputs = model(**inputs)

	    logits_per_video = outputs.logits_per_video
	    probs = logits_per_video.softmax(dim=1)
	    # Only one video is submitted, so row 0 holds all of its scores.
	    video_probs = probs[0]
	    video_logits = logits_per_video[0]

	    results_probs = [
	        (label, f"Probability: {float(prob) * 100:.2f}%")
	        for label, prob in zip(activities_list, video_probs)
	    ]
	    results_logits = [
	        (label, f"Raw Score: {float(logit):.2f}")
	        for label, logit in zip(activities_list, video_logits)
	    ]

	    max_prob_index = torch.argmax(video_probs).item()
	    likely_label = activities_list[max_prob_index]
	    likely_probability = float(video_probs[max_prob_index]) * 100

	    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]

	# Gradio UI: a video upload, a checkpoint selector and a free-text activity
	# go in; the sampled-frame sheet and three text summaries come out.
	iface = gr.Interface(
	    fn=model_interface,
	    inputs=[
	        gr.components.Video(label="Upload a video file"),
	        gr.components.Dropdown(
	            choices=[
	                "microsoft/xclip-base-patch16-zero-shot",
	                "microsoft/xclip-base-patch32-16-frames",
	                "microsoft/xclip-base-patch32",
	            ],
	            # Pre-select a checkpoint so model_interface never receives None.
	            value="microsoft/xclip-base-patch16-zero-shot",
	            label="Model Choice",
	        ),
	        # Components under gr.components take their initial content via
	        # `value`; `default` was the keyword of the older gr.inputs API.
	        gr.components.Textbox(value="dancing", label="Desired Activity to Recognize"),
	    ],
	    outputs=[
	        gr.components.Image(type="pil", label="Sampled Frames"),
	        gr.components.Textbox(type="text", label="Probabilities"),
	        gr.components.Textbox(type="text", label="Raw Scores"),
	        gr.components.Textbox(type="text", label="Top Prediction"),
	    ],
	    live=False,
	)

	iface.launch()