# DeepSense.ai / app.py
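"""Gradio Space that detects bicycles and e-bikes in images, videos, and webcam frames.

Pipeline (as implemented below): a YOLO-World model proposes candidate boxes,
overlapping boxes are de-duplicated with a simple pairwise IoU check, each crop
is re-classified with an OpenCLIP RN50 model against a small set of text
prompts, and the annotated result is returned through a tabbed Gradio UI.
"""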
import os
os.system("pip install -U ultralytics open_clip_torch numpy opencv-python gradio")
import spaces
import gradio as gr
from ultralytics import YOLO
import torch
import open_clip
import cv2
import numpy as np
from PIL import Image
# YOLO-World detector with an open-vocabulary prompt list for bikes and e-bikes
yolo_model = YOLO("yolov8s-worldv2.pt")
yolo_model.set_classes(["electric-bike", "electric-bicycle", "e-bike", "bicycle"])

# OpenCLIP RN50 used to re-classify each detected crop via zero-shot text prompts
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms("RN50-quickgelu", pretrained="openai", device=device)
model.eval()
tokenizer = open_clip.get_tokenizer("RN50-quickgelu")

# Text prompts for CLIP and the display names they map to (index-aligned)
classes = ["an e-bike", "a bicycle", "an electric-bike", "an electric-bicycle"]
pretty_classes = ["e-bike", "bicycle", "e-bike", "e-bike"]
def calculate_iou(box1, box2):
    """Intersection-over-union of two (x1, y1, x2, y2) boxes."""
    x1_1, y1_1, x2_1, y2_1 = box1
    x1_2, y1_2, x2_2, y2_2 = box2
    xi1 = max(x1_1, x1_2)
    yi1 = max(y1_1, y1_2)
    xi2 = min(x2_1, x2_2)
    yi2 = min(y2_1, y2_2)
    inter_area = max(xi2 - xi1, 0) * max(yi2 - yi1, 0)
    box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
    box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
    union_area = box1_area + box2_area - inter_area
    return inter_area / union_area if union_area != 0 else 0
def process_frame(frame):
    """Detect bikes/e-bikes in a BGR frame, re-classify each crop with CLIP, and draw labels."""
    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    height, width, _ = frame.shape
    font_scale = max(width, height) / 1000
    thickness = int(max(width, height) / 300)

    # YOLO-World detection
    results = yolo_model(img_rgb, iou=0.5, conf=0.25)
    boxes, confidences, class_ids = [], [], []
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            confidence = box.conf.item()
            class_id = int(box.cls.item())
            boxes.append((x1, y1, x2, y2))
            confidences.append(confidence)
            class_ids.append(class_id)

    # Cross-class de-duplication: of any two boxes with IoU > 0.5, keep the more confident one
    to_remove = set()
    for i in range(len(boxes)):
        for j in range(i + 1, len(boxes)):
            iou = calculate_iou(boxes[i], boxes[j])
            if iou > 0.5:
                if confidences[i] > confidences[j]:
                    to_remove.add(j)
                else:
                    to_remove.add(i)
    boxes = [box for idx, box in enumerate(boxes) if idx not in to_remove]
    confidences = [conf for idx, conf in enumerate(confidences) if idx not in to_remove]
    class_ids = [cls for idx, cls in enumerate(class_ids) if idx not in to_remove]

    # Zero-shot CLIP classification of each surviving crop, then draw the box and label
    for box, confidence, class_id in zip(boxes, confidences, class_ids):
        x1, y1, x2, y2 = box
        cropped_img = img_rgb[y1:y2, x1:x2]
        if cropped_img.size == 0:
            continue  # skip degenerate boxes with zero width or height
        image_input = preprocess(Image.fromarray(cropped_img)).unsqueeze(0).to(device)
        text_inputs = tokenizer(classes).to(device)
        with torch.no_grad():
            image_features = model.encode_image(image_input)
            text_features = model.encode_text(text_inputs)
            logits_per_image = image_features @ text_features.T
            probs = logits_per_image.softmax(dim=-1).cpu().numpy()
        predicted_class = pretty_classes[probs.argmax()]
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), thickness)
        label = f"{predicted_class} {probs.max():.2f}"
        cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 255, 0), thickness)
    return frame
@spaces.GPU
def detect_objects_image(image):
    # PIL/webcam input is RGB; process_frame expects an OpenCV-style BGR frame
    frame_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    processed = process_frame(frame_bgr)
    return Image.fromarray(cv2.cvtColor(processed, cv2.COLOR_BGR2RGB))
@spaces.GPU
def detect_objects_video(video_file):
    # Read the video frame by frame, annotate each frame, then write the result back out
    cap = cv2.VideoCapture(video_file)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        processed_frame = process_frame(frame)
        frames.append(processed_frame)
    cap.release()
    if not frames:
        return None  # nothing could be decoded from the input file
    height, width, _ = frames[0].shape
    out = cv2.VideoWriter("/tmp/annotated_video.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 30, (width, height))
    for frame in frames:
        out.write(frame)
    out.release()
    return "/tmp/annotated_video.mp4"
global_title = "DeepSense.ai"
file_upload_demo = gr.Interface(
    fn=detect_objects_image,
    inputs=gr.Image(type="pil", label="Input"),
    outputs=gr.Image(type="pil", label="Output"),
    title="Image Upload - Bicycle and E-Bike Detection Model",
    description=(
        "<span style='font-size: 17px;'>"
        "- <b>Step 1</b>: Upload an image in the upload box below.<br>"
        "- <b>Step 2</b>: Press <code>Submit</code> and the detection model will label the image in the output box."
        "</span>"
    ),
    flagging_options=["Incorrect"],
    allow_flagging="manual"
)
video_file_demo = gr.Interface(
    fn=detect_objects_video,
    inputs=gr.Video(label="Input"),
    outputs=gr.Video(label="Output"),
    title="Video Upload - Bicycle and E-Bike Detection Model",
    description=(
        "<span style='font-size: 17px;'>"
        "- <b>Step 1</b>: Upload a video in the upload box below.<br>"
        "- <b>Step 2</b>: Press <code>Submit</code> and the detection model will label the video in the output box."
        "</span>"
    ),
    flagging_options=["Incorrect"],
    allow_flagging="manual"
)
live_video_demo = gr.Interface(
    fn=detect_objects_image,
    inputs=gr.Image(sources=["webcam"], streaming=True, label="Input"),
    outputs=gr.Image(type="pil", label="Output"),
    live=True,
    title="Live Video - Bicycle and E-Bike Detection Model",
    description=(
        "<span style='font-size: 17px;'>"
        "- <b>Step 1</b>: Allow webcam access in the input box below.<br>"
        "- <b>Step 2</b>: The detection model labels each captured frame in the output box automatically."
        "</span>"
    ),
    flagging_options=["Incorrect"],
    allow_flagging="manual"
)
with gr.Blocks(title=global_title) as demo:
    gr.Markdown("<h1 style='text-align: center;'>DeepSense.ai</h1>")
    gr.TabbedInterface(
        [file_upload_demo, video_file_demo, live_video_demo],
        ["Image Upload", "Video Upload", "Live Video"],
    )
    gr.Markdown(
        "<div style='text-align: center; font-size: 15px; margin-top: 20px;'>"
        "Please press the <code>Flag</code> button under the output box if a label is incorrect; "
        "this helps improve the model through reinforcement learning."
        "</div>"
    )
if __name__ == "__main__":
    demo.launch(share=True)