import gradio as gr
from ultralytics import YOLOv10
import cv2
import torch
import os
import spaces

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Define activity categories based on detected objects.
# NOTE(review): the detector can only emit labels present in `model.names`;
# entries such as "computer", "plate" or "treadmill" look like they are not
# in the usual COCO label set and may never match -- confirm against
# `model.names`.
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road"],
    # Add more categories and objects as needed
}


def categorize_activity(detected_objects):
    """Map the labels detected in one frame to the activities they suggest.

    Args:
        detected_objects: Class-name strings detected in a single frame.

    Returns:
        A dict keyed by activity name. An activity is present when at least
        one of its trigger objects occurs in ``detected_objects``; its value
        is a one-element list holding the full ``detected_objects`` list.
    """
    detected = set(detected_objects)
    return {
        activity: [detected_objects]
        for activity, trigger_objects in activity_categories.items()
        if detected.intersection(trigger_objects)
    }


@spaces.GPU
def generate_journal_with_images(video_path):
    """Run object detection on every frame of a video and build a journal.

    Each frame is annotated with its detections and written to the
    ``detected_frames`` folder as ``frame_<index>.jpg``.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        A flat list grouped by activity: a ``"**<activity>:**"`` heading
        string followed by that activity's ``(entry_text, image_path)``
        tuples. The list is empty if no frame matched any activity (or the
        video could not be opened).
    """
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images

    journal_entries = {}
    frame_count = 0
    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV frames are BGR, which is what Ultralytics expects for
            # NumPy input, so the frame is passed without colour conversion.
            # predict() returns a list with one Results object per image.
            result = model.predict(source=frame, device=device)[0]

            # plot() returns a copy of the image with the boxes drawn on it,
            # in the same BGR order, so it can be written directly.
            annotated_frame = result.plot()
            frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
            cv2.imwrite(frame_filename, annotated_frame)

            # Extract labels (class indices) and map them to class names
            detected_objects = [model.names[int(cls_id)] for cls_id in result.boxes.cls.tolist()]

            # Get current timestamp in the video (ms converted to seconds)
            timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

            # Store every matched activity together with its timestamp
            activity_summary = categorize_activity(detected_objects)
            for activity, objects in activity_summary.items():
                journal_entries.setdefault(activity, []).append(
                    (f"At {timestamp:.2f} seconds: {', '.join(objects[0])}", frame_filename)
                )

            frame_count += 1
    finally:
        # Release the capture handle even if detection fails mid-video.
        cap.release()

    # Create a formatted journal output: heading, then that activity's entries
    formatted_journal = []
    for activity, entries in journal_entries.items():
        formatted_journal.append(f"**{activity}:**")
        formatted_journal.extend(entries)

    return formatted_journal


def display_journal_with_images(video):
    """Gradio callback: turn an uploaded video into gallery items.

    Args:
        video: Path of the uploaded video, or ``None`` if nothing was
            uploaded.

    Returns:
        A list of ``(image_path, caption)`` tuples for ``gr.Gallery``. The
        caption is the activity name followed by the journal entry text.
    """
    if not video:
        return []

    journal_with_images = generate_journal_with_images(video)

    display_items = []
    current_activity = ""
    for item in journal_with_images:
        # The journal interleaves heading strings with (entry, image) tuples;
        # remember the heading and fold it into the following captions.
        if isinstance(item, str):
            current_activity = item.strip("*")
            continue
        entry, image_path = item
        caption = f"{current_activity} {entry}".strip()
        # gr.Gallery expects the image first and the caption second.
        display_items.append((image_path, caption))

    return display_items


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video")
    output_gallery = gr.Gallery(label="Generated Daily Journal with Images")
    run_button = gr.Button("Generate Journal")

    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=output_gallery)

iface.launch()