import gradio as gr
import numpy as np
import torch
from decord import VideoReader, gpu
from PIL import Image
from transformers import AutoProcessor, AutoModel

# The script assumes a CUDA-capable environment: decord decodes on the GPU and
# the model is moved to "cuda".
print(f"Is CUDA available: {torch.cuda.is_available()}")  # True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")  # Tesla T4


def sample_uniform_frame_indices(clip_len, seg_len):
    """
    Samples `clip_len` uniformly spaced frame indices from a video of length `seg_len`.
    Handles the edge case where `seg_len` is less than `clip_len` by repeating indices.
    """
    if seg_len < clip_len:
        repeat_factor = np.ceil(clip_len / seg_len).astype(int)
        indices = np.arange(seg_len).tolist() * repeat_factor
        indices = indices[:clip_len]
    else:
        spacing = seg_len // clip_len
        indices = [i * spacing for i in range(clip_len)]
    return np.array(indices).astype(np.int64)


def read_video_decord(file_path, indices):
    """Decodes the frames at `indices` from the video at `file_path` on the GPU."""
    vr = VideoReader(file_path, num_threads=1, ctx=gpu(0))
    video = vr.get_batch(indices).asnumpy()
    return video


def concatenate_frames(frames, clip_len):
    """Tiles the sampled frames into a single grid image for display."""
    assert len(frames) == clip_len, f"The function expects {clip_len} frames as input."
    layout = {
        32: (4, 8),
        16: (4, 4),
        8: (2, 4),
    }
    rows, cols = layout[clip_len]
    combined_image = Image.new('RGB', (frames[0].shape[1] * cols, frames[0].shape[0] * rows))
    frame_iter = iter(frames)
    y_offset = 0
    for i in range(rows):
        x_offset = 0
        for j in range(cols):
            img = Image.fromarray(next(frame_iter))
            combined_image.paste(img, (x_offset, y_offset))
            x_offset += frames[0].shape[1]
        y_offset += frames[0].shape[0]
    return combined_image


# Cache loaded processors/models so that choosing the same checkpoint again
# does not reload it from disk on every request.
_model_cache = {}


def load_model(model_choice):
    if model_choice not in _model_cache:
        processor = AutoProcessor.from_pretrained(model_choice)
        model = AutoModel.from_pretrained(model_choice).to("cuda")
        _model_cache[model_choice] = (processor, model)
    return _model_cache[model_choice]


def model_interface(uploaded_video, model_choice, activities):
    # Each checkpoint expects a fixed number of frames per clip.
    clip_len = {
        "microsoft/xclip-base-patch16-zero-shot": 32,
        "microsoft/xclip-base-patch32-16-frames": 16,
        "microsoft/xclip-base-patch32": 8,
    }.get(model_choice, 32)
    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
    video = read_video_decord(uploaded_video, indices)
    # Grid image of the sampled frames, shown alongside the predictions.
    concatenated_image = concatenate_frames(video, clip_len)

    processor, model = load_model(model_choice)

    activities_list = [activity.strip() for activity in activities.split(",")]
    inputs = processor(
        text=activities_list,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )
    # The model lives on the GPU, so the input tensors must be moved there too.
    inputs = {name: tensor.to("cuda") for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    results_probs = []
    results_logits = []
    for i, activity in enumerate(activities_list):
        prob = float(probs[0][i])
        logit = float(logits_per_video[0][i])
        results_probs.append((activity, f"Probability: {prob * 100:.2f}%"))
        results_logits.append((activity, f"Raw Score: {logit:.2f}"))

    # Retrieve the most likely predicted label and its probability.
    max_prob_idx = probs[0].argmax().item()
    most_likely_activity = activities_list[max_prob_idx]
    most_likely_prob = float(probs[0][max_prob_idx])

    return concatenated_image, results_probs, results_logits, (
        most_likely_activity, f"Probability: {most_likely_prob * 100:.2f}%"
    )
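
# A quick illustrative sanity check for the sampler (example values of my own,
# not part of the original app): indices are evenly spaced when the video is
# long enough, and wrap around when it is shorter than the clip length.
assert sample_uniform_frame_indices(4, seg_len=10).tolist() == [0, 2, 4, 6]
assert sample_uniform_frame_indices(4, seg_len=2).tolist() == [0, 1, 0, 1]
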
iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.components.Video(label="Upload a video file"),
        gr.components.Dropdown(choices=[
            "microsoft/xclip-base-patch16-zero-shot",
            "microsoft/xclip-base-patch32-16-frames",
            "microsoft/xclip-base-patch32",
        ], label="Model Choice"),
        gr.components.Textbox(lines=4, label="Enter activities (comma-separated)"),
    ],
    outputs=[
        gr.components.Image(type="pil", label="Sampled Frames"),
        gr.components.Textbox(type="text", label="Probabilities"),
        gr.components.Textbox(type="text", label="Raw Scores"),
        gr.components.Textbox(type="text", label="Most Likely Prediction"),
    ],
    live=False,
)

iface.launch()
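
# If the server is not reachable at localhost (e.g., when running in Colab),
# Gradio's standard share option can be used instead; this is a sketch of that
# option, not something the original app sets:
#   iface.launch(share=True)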