File size: 2,510 Bytes
5b8460b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import gradio as gr
import cv2
from transformers import YolosImageProcessor, YolosForObjectDetection
from PIL import Image
import torch

# Load model and processor once at import time so every frame reuses them.
# Weights are fetched from the Hugging Face hub on first run (network required);
# subsequent runs hit the local cache.
model = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')
image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")

def process_frame(frame):
    """Run YOLOS object detection on one BGR frame and draw the detections.

    Args:
        frame: BGR image as a numpy array (OpenCV convention).

    Returns:
        The downscaled (640x360) BGR frame with a green bounding box and a
        "label: score" caption for every detection scoring above 0.9.
    """
    # Downscale first: inference and drawing then share one coordinate space.
    frame = cv2.resize(frame, (640, 360))

    # OpenCV frames are BGR; PIL and the HF processor expect RGB.
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    inputs = image_processor(images=image, return_tensors="pt")

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # target_sizes wants (height, width); PIL's .size is (width, height).
    target_sizes = torch.tensor([image.size[::-1]])
    results = image_processor.post_process_object_detection(
        outputs, threshold=0.9, target_sizes=target_sizes
    )[0]

    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        # Truncate directly to pixel coordinates (the old round-to-2-decimals
        # step was dead work: the values were immediately int()-ed anyway).
        x0, y0, x1, y1 = (int(coord) for coord in box.tolist())
        cv2.rectangle(frame, (x0, y0), (x1, y1), (0, 255, 0), 2)
        # Clamp the caption baseline so text stays on-screen when a box
        # touches the top edge (y0 - 10 would otherwise be negative).
        text_y = max(y0 - 10, 15)
        cv2.putText(frame, f"{model.config.id2label[label.item()]}: {round(score.item(), 2)}",
                    (x0, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    return frame

def video_object_detection(video):
    """Run per-frame object detection over a video and write an annotated copy.

    Args:
        video: Path to the input video file.

    Returns:
        Path to the annotated output video ('/tmp/output.mp4').

    Raises:
        ValueError: If the video cannot be opened or yields no frames
            (the original code crashed with IndexError in that case).
    """
    cap = cv2.VideoCapture(video)
    if not cap.isOpened():
        raise ValueError(f"Could not open video: {video}")

    # Preserve the source frame rate so playback speed is unchanged;
    # fall back to the previous hard-coded 20 fps if the container
    # does not report one (cap.get returns 0 then, which is falsy).
    fps = cap.get(cv2.CAP_PROP_FPS) or 20

    output_path = '/tmp/output.mp4'
    writer = None
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            processed = process_frame(frame)
            # Open the writer lazily once the processed frame size is known,
            # and stream frames out instead of buffering the whole video in
            # memory (the old list grew without bound on long inputs).
            if writer is None:
                height, width, _ = processed.shape
                writer = cv2.VideoWriter(
                    output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height)
                )
            writer.write(processed)
    finally:
        # Release resources even if processing raises mid-video.
        cap.release()
        if writer is not None:
            writer.release()

    if writer is None:
        raise ValueError(f"No frames could be read from video: {video}")

    return output_path

# Create Gradio interface with live=True
# NOTE(review): live=True re-triggers the full per-frame detection pipeline
# on every input change, which is expensive for whole videos; consider
# dropping it in favor of the explicit Submit button — confirm intended UX.
iface = gr.Interface(fn=video_object_detection, inputs="video", outputs="video", title="YOLOs-Tiny Video Detection", live=True)
iface.launch()