import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import json
import spaces
import markdown
import requests
import io
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
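# Object detection: YOLOv10 checkpoint (jameslahm/yolov10x) used to find objects in each video frame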
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)
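# Vision-language model: Llama 3.2 11B Vision generates the journal text for a single frame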
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_vision = MllamaForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
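# Code model: Qwen2.5-Coder-7B-Instruct turns the journal text into an HTML/Bootstrap infographic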
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
SYSTEM_INSTRUCTION = "You are DailySnap. Your job is to analyse the given image and provide a daily journal entry about it, using some random time."
def extract_assistant_reply(input_string):
# Define the tag that indicates the start of the assistant's reply
start_tag = "<|start_header_id|>assistant<|end_header_id|>"
# Find the position where the assistant's reply starts
start_index = input_string.find(start_tag)
if start_index == -1:
return "Assistant's reply not found."
start_index += len(start_tag)
# Extract everything after the start tag
assistant_reply = input_string[start_index:].strip()
return assistant_reply
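# Helper (currently unused here): extract the first fenced code block (```html or ```json) from markdown text and parse it as JSON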
def extract_json_from_markdown(markdown_text):
try:
start_idx = markdown_text.find('```')
end_idx = markdown_text.find('```', start_idx + 3)
if markdown_text[start_idx:start_idx + 7] == '```html':
start_idx += len('```html')
else:
start_idx += len('```')
# Extract and clean up the code block (json or not)
json_str = markdown_text[start_idx:end_idx].strip()
# Try to load it as JSON
return json.loads(json_str)
except Exception as e:
print(f"Error extracting JSON: {e}")
return None
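# Describe a single frame with the vision model and return the journal entry rendered as HTML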
@spaces.GPU
def generate_image_desc(image):
messages = [
{"role": "user", "content": [
{"type": "image"},
{"type": "text", "text": SYSTEM_INSTRUCTION}
]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)  # place inputs on the vision model's device
# Generate the output from the model
output = model_vision.generate(**inputs, max_new_tokens=300)
print(output)
markdown_text = processor.decode(output[0])
print(markdown_text)
markdown_text=extract_assistant_reply(markdown_text)
html_output = markdown.markdown(markdown_text)
return html_output
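# Render a journal as an HTML/Bootstrap infographic using the code model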
@spaces.GPU
def generate_journal_infographics(journal):
prompt = f"Generate daily journal inforgraphics using html for the following:\n\n{journal}"
messages = [
{"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate infographics using htmnl bootstrap icon and generate highly appealing daily journal as per the user detail"},
{"role": "user", "content": prompt}
]
# Prepare inputs for the model
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)  # place inputs on the code model's device
# Generate the documentation
generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
documentation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(documentation)
return documentation
# Define activity categories based on detected objects
activity_categories = {
"Working": ["laptop", "computer", "keyboard", "office chair"],
"Meal Time": ["fork", "spoon", "plate", "food"],
"Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
"Outdoors": ["car", "tree", "bicycle", "road","subway","metro"],
# Add more categories and objects as needed
}
# Function to map detected objects to categorized activities
def categorize_activity(detected_objects):
categorized_activities = {}
for activity, objects in activity_categories.items():
if any(obj in detected_objects for obj in objects):
if activity not in categorized_activities:
categorized_activities[activity] = []
categorized_activities[activity].append(detected_objects)
return categorized_activities
# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
score, _ = ssim(gray_frame1, gray_frame2, full=True)
return score < threshold
# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
cap = cv2.VideoCapture(video_path)
journal_entries = []
image_paths = []
frame_count = 0
output_folder = "detected_frames"
os.makedirs(output_folder, exist_ok=True) # Create folder to store images
last_processed_second = -1 # Keep track of the last processed second
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
# Get the current timestamp in the video
current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000 # Convert ms to seconds
current_second = int(current_time) # Round down to the nearest second
# Process only one frame per second
if current_second > last_processed_second:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# Make predictions using YOLOv10 on the current frame
results = model.predict(source=frame_rgb, device=device)
# Filter detected objects based on confidence threshold
detected_objects = []
for box in results[0].boxes:
                if box.conf >= confidence_threshold:  # Only include objects meeting the confidence threshold
detected_objects.append(model.names[int(box.cls)])
# Only process frames where objects with confidence >= threshold are detected
if detected_objects: # If there are high-confidence detected objects
# Plot bounding boxes and labels on the image
annotated_frame = results[0].plot() # Plot detection results on the frame
# Save the annotated image
frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1]) # Convert back to BGR for saving
image_paths.append(frame_filename)
# Categorize the detected objects into activities
activity_summary = categorize_activity(detected_objects)
# Store the activities with their timestamp
for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds ({activity}): {', '.join(objects[0])}")
last_processed_second = current_second # Update the last processed second
frame_count += 1
cap.release()
return journal_entries, image_paths
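# Gradio callback: run detection on the uploaded video, build the journal, and (optionally) render an infographic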
def display_journal_with_images(video):
journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
pil_images = []
for image_path in image_paths:
# Read the image using OpenCV
image = cv2.imread(image_path)
# Convert the image from BGR (OpenCV) to RGB (PIL)
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Convert the NumPy array to a PIL image
pil_image = Image.fromarray(image_rgb)
pil_images.append(pil_image)
    infographic_html = ''
    if len(pil_images) >= 2:  # just for mockup: only describe the first annotated frame
        first_frame_detail = generate_image_desc(pil_images[0])
        infographic_html = generate_journal_infographics(first_frame_detail)
    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html
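# Gradio UI: video upload in; journal text, annotated frames, and infographic HTML out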
with gr.Blocks() as iface:
video_input = gr.Video(label="Upload Video", height=300)
journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
image_gallery = gr.Gallery(label="Annotated Frames")
run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()
    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery, infographic_html])
iface.launch()