import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import json
import spaces
import markdown
import requests
import io
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# YOLOv10 detector used to spot objects in sampled video frames.
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Vision-language model that narrates a frame as a journal entry.
model_vision = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"

# Code-generation model that renders the journal as an HTML infographic.
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

SYSTEM_INSTRUCTION = "You are DailySnap, an intelligent assistant tasked with analyzing images and generating a visually appealing daily journal entry based on the content of the image. Your job is to examine the given image, identify key elements such as objects, people, and emotions, and then create a narrative that reflects the activities or events captured. The journal should include random timestamps (e.g., 7:15 AM, 1:30 PM) to create a chronological flow, giving the narrative a natural, realistic feel. The final output should be engaging and coherent, combining image analysis with descriptive storytelling to provide users with a meaningful daily journal based on the visual content."


def extract_assistant_reply(input_string):
    """Return the text that follows the Llama-3.2 assistant header tag."""
    start_tag = "<|start_header_id|>assistant<|end_header_id|>"

    start_index = input_string.find(start_tag)
    if start_index == -1:
        return "Assistant's reply not found."
    start_index += len(start_tag)

    assistant_reply = input_string[start_index:].strip()
    return assistant_reply
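
# Usage sketch (transcript abbreviated for illustration):
#   raw = "...<|start_header_id|>assistant<|end_header_id|>\n7:15 AM - A quiet walk..."
#   extract_assistant_reply(raw)  # -> "7:15 AM - A quiet walk..."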


def extract_json_from_markdown(markdown_text):
    """Pull the first fenced code block out of a markdown string and parse it as JSON."""
    try:
        start_idx = markdown_text.find('```')
        end_idx = markdown_text.find('```', start_idx + 3)

        # Skip the opening fence, including an optional ```html language tag.
        if markdown_text[start_idx:start_idx + 7] == '```html':
            start_idx += len('```html')
        else:
            start_idx += len('```')

        json_str = markdown_text[start_idx:end_idx].strip()

        return json.loads(json_str)
    except Exception as e:
        print(f"Error extracting JSON: {e}")
        return None
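
# Usage sketch (hypothetical model output):
#   extract_json_from_markdown('Here you go:\n```\n{"mood": "calm"}\n```')
#   # -> {'mood': 'calm'}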


@spaces.GPU
def generate_image_desc(image):
    """Describe a frame with Llama-3.2 Vision and return the journal entry as HTML."""
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": SYSTEM_INSTRUCTION}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Inputs must land on the vision model's device, not the YOLO detector's.
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)

    output = model_vision.generate(**inputs, max_new_tokens=300)
    print(output)
    markdown_text = processor.decode(output[0])
    print(markdown_text)

    markdown_text = extract_assistant_reply(markdown_text)
    html_output = markdown.markdown(markdown_text)
    return html_output


@spaces.GPU
def generate_journal_infographics(journal):
    """Render the journal text as an HTML/Bootstrap infographic with Qwen2.5-Coder."""
    prompt = f"Generate daily journal infographics using html for the following:\n\n{journal}"

    messages = [
        {"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate visually appealing daily journals and infographics. Your primary function is to transform user-provided details into structured and aesthetically engaging content. When generating infographics, you must use HTML and Bootstrap icons to create visually compelling and clear representations of the user's data or ideas. For daily journals, you should organize the information into an appealing, easy-to-read format that incorporates icons, headings, and layouts based on the user's preferences. Your designs should always focus on clarity, creativity, and user-centric formatting, ensuring the final product is both functional and visually engaging. Your ultimate goal is to help users effortlessly convert their daily activities and narratives into attractive visual content with minimal guidance."},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Inputs belong on the code model's device, not the YOLO detector's.
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)

    generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
    # Drop the prompt tokens so only the newly generated text is decoded.
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    documentation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(documentation)
    return documentation


# Object names that hint at each daily activity.
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road", "subway", "metro"],
}


def categorize_activity(detected_objects):
    """Group detected object names under the activities they suggest."""
    categorized_activities = {}

    for activity, objects in activity_categories.items():
        # Keep only the detections that belong to this activity.
        matched = [obj for obj in detected_objects if obj in objects]
        if matched:
            categorized_activities.setdefault(activity, []).extend(matched)

    return categorized_activities
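
# For example:
#   categorize_activity(["laptop", "fork", "dog"])
#   # -> {"Working": ["laptop"], "Meal Time": ["fork"]}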


def is_frame_different(frame1, frame2, threshold=0.9):
    """Return True when two frames differ visually (SSIM score below the threshold)."""
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold
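
# This helper is not wired into the pipeline yet; one way it could be used
# (sketch, assuming a prev_frame variable tracked inside the capture loop):
#   if prev_frame is None or is_frame_different(prev_frame, frame):
#       results = model.predict(source=frame_rgb, device=device)
#       prev_frame = frame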


@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
    """Sample the video roughly once per second, detect objects, and build journal entries.

    Note: frame_interval is currently unused; frames are selected by timestamp instead.
    """
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)

    last_processed_second = -1

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000
        current_second = int(current_time)

        # Process at most one frame per second of video.
        if current_second > last_processed_second:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            results = model.predict(source=frame_rgb, device=device)

            # Keep only detections above the confidence threshold.
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:
                    detected_objects.append(model.names[int(box.cls)])

            if detected_objects:
                annotated_frame = results[0].plot()

                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                # plot() follows the input channel order (RGB here), so flip to BGR for imwrite.
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])
                image_paths.append(frame_filename)

                activity_summary = categorize_activity(detected_objects)

                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds ({activity}): {', '.join(objects)}")

            last_processed_second = current_second

        frame_count += 1

    cap.release()

    return journal_entries, image_paths
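
# Illustrative call (the video path is hypothetical):
#   entries, frames = generate_journal_with_images("day_clip.mp4", confidence_threshold=0.8)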


def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
    pil_images = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        # OpenCV loads BGR; PIL expects RGB.
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)
        pil_images.append(pil_image)

    infographic_html = ''

    # Build the infographic only when at least two annotated frames were captured.
    if len(pil_images) >= 2:
        first_frame_detail = generate_image_desc(pil_images[0])
        infographic_html = generate_journal_infographics(first_frame_detail)

    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()

    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery, infographic_html])

iface.launch()