import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import json
import spaces
import markdown
import requests
import io
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
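# Object detection: YOLOv10 checkpoint (jameslahm/yolov10x) used to find objects in each video frame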
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)
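# Vision-language model: Llama 3.2 11B Vision generates the journal text for a single frame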
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_vision = MllamaForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
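# Code model: Qwen2.5-Coder-7B-Instruct turns the journal text into an HTML/Bootstrap infographic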
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
SYSTEM_INSTRUCTION = "You are DailySnap. Your job is to analyse the given image and provide a daily journal entry about it, using some random time."
def extract_assistant_reply(input_string):
# Define the tag that indicates the start of the assistant's reply
start_tag = "<|start_header_id|>assistant<|end_header_id|>"
# Find the position where the assistant's reply starts
start_index = input_string.find(start_tag)
if start_index == -1:
return "Assistant's reply not found."
start_index += len(start_tag)
# Extract everything after the start tag
assistant_reply = input_string[start_index:].strip()
return assistant_reply
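# Helper (currently unused here): extract the first fenced code block (```html or ```json) from markdown text and parse it as JSON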
def extract_json_from_markdown(markdown_text):
try:
start_idx = markdown_text.find('```')
end_idx = markdown_text.find('```', start_idx + 3)
if markdown_text[start_idx:start_idx + 7] == '```html':
start_idx += len('```html')
else:
start_idx += len('```')
# Extract and clean up the code block (json or not)
json_str = markdown_text[start_idx:end_idx].strip()
# Try to load it as JSON
return json.loads(json_str)
except Exception as e:
print(f"Error extracting JSON: {e}")
return None
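# Describe a single frame with the vision model and return the journal entry rendered as HTML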
@spaces.GPU
def generate_image_desc(image):
messages = [
{"role": "user", "content": [
{"type": "image"},
{"type": "text", "text": SYSTEM_INSTRUCTION}
]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)  # place inputs on the vision model's device
# Generate the output from the model
output = model_vision.generate(**inputs, max_new_tokens=300)
print(output)
markdown_text = processor.decode(output[0])
print(markdown_text)
markdown_text=extract_assistant_reply(markdown_text)
html_output = markdown.markdown(markdown_text)
return html_output
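# Render a journal as an HTML/Bootstrap infographic using the code model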
@spaces.GPU
def generate_journal_infographics(journal):
prompt = f"Generate daily journal inforgraphics using html for the following:\n\n{journal}"
messages = [
{"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate infographics using htmnl bootstrap icon and generate highly appealing daily journal as per the user detail"},
{"role": "user", "content": prompt}
]
# Prepare inputs for the model
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)  # place inputs on the code model's device
# Generate the documentation
generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
documentation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(documentation)
return documentation
# Define activity categories based on detected objects
activity_categories = {
"Working": ["laptop", "computer", "keyboard", "office chair"],
"Meal Time": ["fork", "spoon", "plate", "food"],
"Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
"Outdoors": ["car", "tree", "bicycle", "road","subway","metro"],
# Add more categories and objects as needed
}
# Function to map detected objects to categorized activities
def categorize_activity(detected_objects):
categorized_activities = {}
for activity, objects in activity_categories.items():
if any(obj in detected_objects for obj in objects):
if activity not in categorized_activities:
categorized_activities[activity] = []
categorized_activities[activity].append(detected_objects)
return categorized_activities
# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
score, _ = ssim(gray_frame1, gray_frame2, full=True)
return score < threshold
# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
cap = cv2.VideoCapture(video_path)
journal_entries = []
image_paths = []
frame_count = 0
output_folder = "detected_frames"
os.makedirs(output_folder, exist_ok=True) # Create folder to store images
last_processed_second = -1 # Keep track of the last processed second
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
# Get the current timestamp in the video
current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000 # Convert ms to seconds
current_second = int(current_time) # Round down to the nearest second
# Process only one frame per second
if current_second > last_processed_second:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# Make predictions using YOLOv10 on the current frame
results = model.predict(source=frame_rgb, device=device)
# Filter detected objects based on confidence threshold
detected_objects = []
for box in results[0].boxes:
                if box.conf >= confidence_threshold:  # Only include objects meeting the confidence threshold
detected_objects.append(model.names[int(box.cls)])
# Only process frames where objects with confidence >= threshold are detected
if detected_objects: # If there are high-confidence detected objects
# Plot bounding boxes and labels on the image
annotated_frame = results[0].plot() # Plot detection results on the frame
# Save the annotated image
frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1]) # Convert back to BGR for saving
image_paths.append(frame_filename)
# Categorize the detected objects into activities
activity_summary = categorize_activity(detected_objects)
# Store the activities with their timestamp
for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds ({activity}): {', '.join(objects[0])}")
last_processed_second = current_second # Update the last processed second
frame_count += 1
cap.release()
return journal_entries, image_paths
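# Gradio callback: run detection on the uploaded video, build the journal, and (optionally) render an infographic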
def display_journal_with_images(video):
journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
pil_images = []
for image_path in image_paths:
# Read the image using OpenCV
image = cv2.imread(image_path)
# Convert the image from BGR (OpenCV) to RGB (PIL)
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Convert the NumPy array to a PIL image
pil_image = Image.fromarray(image_rgb)
pil_images.append(pil_image)
    infographic_html = ''
    if len(pil_images) >= 2:  # just for mockup: only describe the first annotated frame
        first_frame_detail = generate_image_desc(pil_images[0])
        infographic_html = generate_journal_infographics(first_frame_detail)
    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html
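# Gradio UI: video upload in; journal text, annotated frames, and infographic HTML out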
with gr.Blocks() as iface:
video_input = gr.Video(label="Upload Video", height=300)
journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
image_gallery = gr.Gallery(label="Annotated Frames")
run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()
    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery, infographic_html])
iface.launch()