# DailySnap / app.py
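# DailySnap: a Gradio Space that samples one frame per second from an uploaded video,
# runs YOLOv10 object detection on each sampled frame, groups the detections into
# daily-activity categories, and then uses Llama-3.2-Vision plus Qwen2.5-Coder to
# turn the first annotated frame into an HTML journal infographic.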

import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import json
import spaces
import markdown
import requests
import io
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# YOLOv10 detector used for per-frame object detection
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Llama 3.2 vision model used to describe frames
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_vision = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

# Qwen coder model used to render the journal as an HTML infographic
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
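
# Note: all three models are loaded once at module import and shared across requests.
# meta-llama/Llama-3.2-11B-Vision-Instruct is a gated checkpoint, so the Space needs a
# Hugging Face token with granted access (e.g. an HF_TOKEN secret) to download it.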

SYSTEM_INSTRUCTION = "You are DailySnap. Your job is to analyse the given image and provide a daily journal entry about the image, using some random time."

def extract_assistant_reply(input_string):
    # Define the tag that indicates the start of the assistant's reply
    start_tag = "<|start_header_id|>assistant<|end_header_id|>"
    # Find the position where the assistant's reply starts
    start_index = input_string.find(start_tag)
    if start_index == -1:
        return "Assistant's reply not found."
    start_index += len(start_tag)
    # Extract everything after the start tag
    assistant_reply = input_string[start_index:].strip()
    return assistant_reply

def extract_json_from_markdown(markdown_text):
    try:
        start_idx = markdown_text.find('```')
        end_idx = markdown_text.find('```', start_idx + 3)
        if markdown_text[start_idx:start_idx + 7] == '```html':
            start_idx += len('```html')
        else:
            start_idx += len('```')
        # Extract and clean up the code block (json or not)
        json_str = markdown_text[start_idx:end_idx].strip()
        # Try to load it as JSON
        return json.loads(json_str)
    except Exception as e:
        print(f"Error extracting JSON: {e}")
        return None

@spaces.GPU
def generate_image_desc(image):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": SYSTEM_INSTRUCTION}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)

    # Generate the output from the vision model
    output = model_vision.generate(**inputs, max_new_tokens=300)
    print(output)
    markdown_text = processor.decode(output[0])
    print(markdown_text)
    markdown_text = extract_assistant_reply(markdown_text)
    html_output = markdown.markdown(markdown_text)
    return html_output

@spaces.GPU
def generate_journal_infographics(journal):
    prompt = f"Generate daily journal infographics using HTML for the following:\n\n{journal}"
    messages = [
        {"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate infographics using HTML and Bootstrap icons, producing a highly appealing daily journal from the user's details"},
        {"role": "user", "content": prompt}
    ]

    # Prepare inputs for the model
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)

    # Generate the infographic HTML
    generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    documentation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(documentation)
    return documentation

# Define activity categories based on detected objects
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road", "subway", "metro"],
    # Add more categories and objects as needed
}

# Function to map detected objects to categorized activities
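# For example (hypothetical input), categorize_activity(["laptop", "fork"]) returns
#   {"Working": [["laptop", "fork"]], "Meal Time": [["laptop", "fork"]]},
# i.e. each matched activity keeps the full list of objects detected in that frame.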
def categorize_activity(detected_objects):
    categorized_activities = {}
    for activity, objects in activity_categories.items():
        if any(obj in detected_objects for obj in objects):
            if activity not in categorized_activities:
                categorized_activities[activity] = []
            categorized_activities[activity].append(detected_objects)
    return categorized_activities

# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold
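
# NOTE: is_frame_different (and the frame_interval parameter below) are not currently
# wired into the processing loop, which simply samples one frame per second. A possible
# (hypothetical) refinement would be to also skip near-duplicate frames, e.g.:
#   if last_frame is None or is_frame_different(frame, last_frame):
#       ...  # run detection on this frame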

# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images
    last_processed_second = -1  # Keep track of the last processed second

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Get the current timestamp in the video
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert ms to seconds
        current_second = int(current_time)  # Round down to the nearest second

        # Process only one frame per second
        if current_second > last_processed_second:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Make predictions using YOLOv10 on the current frame
            results = model.predict(source=frame_rgb, device=device)

            # Filter detected objects based on the confidence threshold
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:
                    detected_objects.append(model.names[int(box.cls)])

            # Only keep frames where high-confidence objects are detected
            if detected_objects:
                # Plot bounding boxes and labels on the image
                annotated_frame = results[0].plot()

                # Save the annotated image
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])  # Convert back to BGR for saving
                image_paths.append(frame_filename)

                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)

                # Store the activities with their timestamp
                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds ({activity}): {', '.join(objects[0])}")

            last_processed_second = current_second  # Update the last processed second

        frame_count += 1

    cap.release()
    return journal_entries, image_paths

def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)

    pil_images = []
    for image_path in image_paths:
        # Read the image using OpenCV
        image = cv2.imread(image_path)
        # Convert the image from BGR (OpenCV) to RGB (PIL)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Convert the NumPy array to a PIL image
        pil_image = Image.fromarray(image_rgb)
        pil_images.append(pil_image)

    infographic_html = ''
    if len(pil_images) >= 2:  # just for mockup
        first_frame_detail = generate_image_desc(pil_images[0])
        infographic_html = generate_journal_infographics(first_frame_detail)

    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html

with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()

    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery, infographic_html])

iface.launch()