import os

# Log the installed packages (handy when debugging the Space's environment)
os.system("pip freeze")

import cv2
from PIL import Image
import clip
import torch
import math
import numpy as np
import datetime
import gradio as gr
# Load the OpenAI CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
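# ViT-B/32 produces 512-dimensional image and text embeddings (which is why the
# feature buffer below is sized [0, 512]); the preprocess transform returned by
# clip.load resizes and center-crops each frame to 224x224 and normalizes it.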
def inference(video, text):
    # The extracted frame images will be stored in video_frames
    video_frames = []

    # Open the video file and get its frame rate (used later to compute timestamps)
    capture = cv2.VideoCapture(video)
    fps = capture.get(cv2.CAP_PROP_FPS)

    # Read frames one by one until the end of the video
    while capture.isOpened():
        ret, frame = capture.read()
        if not ret:
            break
        # OpenCV returns BGR frames; reverse the channel axis to get RGB for PIL
        video_frames.append(Image.fromarray(frame[:, :, ::-1]))

    # Print some statistics
    print(f"Frames extracted: {len(video_frames)}")
    # You can try tuning the batch size for very large videos, but it should usually be OK
    batch_size = 256
    batches = math.ceil(len(video_frames) / batch_size)

    # The encoded features will be stored in video_features.
    # clip.load keeps the model in float16 on GPU but converts it to float32 on CPU,
    # so the empty buffer's dtype has to match the device.
    feature_dtype = torch.float16 if device == "cuda" else torch.float32
    video_features = torch.empty([0, 512], dtype=feature_dtype).to(device)

    # Process each batch
    for i in range(batches):
        print(f"Processing batch {i+1}/{batches}")

        # Get the relevant frames
        batch_frames = video_frames[i * batch_size : (i + 1) * batch_size]

        # Preprocess the images for the batch
        batch_preprocessed = torch.stack([preprocess(frame) for frame in batch_frames]).to(device)

        # Encode with CLIP and normalize
        with torch.no_grad():
            batch_features = model.encode_image(batch_preprocessed)
            batch_features /= batch_features.norm(dim=-1, keepdim=True)

        # Append the batch to the tensor containing all features
        video_features = torch.cat((video_features, batch_features))

    # Print some stats
    print(f"Features: {video_features.shape}")
    search_query = text
    display_results_count = 1

    # Encode and normalize the search query using CLIP
    with torch.no_grad():
        text_features = model.encode_text(clip.tokenize(search_query).to(device))
        text_features /= text_features.norm(dim=-1, keepdim=True)

    # Compute the similarity between the search query and each frame.
    # Both feature sets are L2-normalized, so the dot product is the cosine similarity.
    similarities = (100.0 * video_features @ text_features.T)
    values, best_photo_idx = similarities.topk(display_results_count, dim=0)

    # Return the best-matching frame and its timestamp: the frame index divided by
    # the frame rate gives the offset in seconds (e.g. frame 450 at 30 fps is ~15 s in).
    for frame_id in best_photo_idx:
        frame = video_frames[frame_id]
        seconds = round(frame_id.cpu().numpy()[0] / fps)
        return frame, f"Found at {str(datetime.timedelta(seconds=seconds))}"
title = "Video Search"
description = "Gradio demo for using OpenAI's CLIP to search inside videos. To use it, simply upload your video and add your text. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/haltakov/natural-language-youtube-search' target='_blank'>Github Repo</a></p>"
examples=[['test.mp4',"gas station"]]
gr.Interface(
    inference,
    ["video", "text"],
    [gr.outputs.Image(type="pil", label="Output"), "text"],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch(debug=True, enable_queue=True)
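# Quick sanity check without the web UI (a sketch; assumes a short clip named
# test.mp4 sits next to this script, as in the example above):
#
#   frame, timestamp = inference("test.mp4", "gas station")
#   frame.save("best_match.png")
#   print(timestamp)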