import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import json
import spaces
import markdown
import requests
import io
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# YOLOv10 detector used to spot objects in sampled video frames.
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Vision-language model that narrates a frame as a journal entry.
model_vision = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"

# Code-generation model that renders the journal as an HTML infographic.
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

SYSTEM_INSTRUCTION = "You are DailySnap, an intelligent assistant tasked with analyzing images and generating a visually appealing daily journal entry based on the content of the image. Your job is to examine the given image, identify key elements such as objects, people, and emotions, and then create a narrative that reflects the activities or events captured. The journal should include random timestamps (e.g., 7:15 AM, 1:30 PM) to create a chronological flow, giving the narrative a natural, realistic feel. The final output should be engaging and coherent, combining image analysis with descriptive storytelling to provide users with a meaningful daily journal based on the visual content."


def extract_assistant_reply(input_string):
    """Return the text that follows the Llama-3.2 assistant header tag."""
    start_tag = "<|start_header_id|>assistant<|end_header_id|>"

    start_index = input_string.find(start_tag)
    if start_index == -1:
        return "Assistant's reply not found."
    start_index += len(start_tag)

    assistant_reply = input_string[start_index:].strip()
    return assistant_reply
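
# Usage sketch (transcript abbreviated for illustration):
#   raw = "...<|start_header_id|>assistant<|end_header_id|>\n7:15 AM - A quiet walk..."
#   extract_assistant_reply(raw)  # -> "7:15 AM - A quiet walk..."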


def extract_json_from_markdown(markdown_text):
    """Pull the first fenced code block out of a markdown string and parse it as JSON."""
    try:
        start_idx = markdown_text.find('```')
        end_idx = markdown_text.find('```', start_idx + 3)

        # Skip the opening fence, including an optional ```html language tag.
        if markdown_text[start_idx:start_idx + 7] == '```html':
            start_idx += len('```html')
        else:
            start_idx += len('```')

        json_str = markdown_text[start_idx:end_idx].strip()

        return json.loads(json_str)
    except Exception as e:
        print(f"Error extracting JSON: {e}")
        return None
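
# Usage sketch (hypothetical model output):
#   extract_json_from_markdown('Here you go:\n```\n{"mood": "calm"}\n```')
#   # -> {'mood': 'calm'}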


@spaces.GPU
def generate_image_desc(image):
    """Describe a frame with Llama-3.2 Vision and return the journal entry as HTML."""
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": SYSTEM_INSTRUCTION}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Inputs must land on the vision model's device, not the YOLO detector's.
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)

    output = model_vision.generate(**inputs, max_new_tokens=300)
    print(output)
    markdown_text = processor.decode(output[0])
    print(markdown_text)

    markdown_text = extract_assistant_reply(markdown_text)
    html_output = markdown.markdown(markdown_text)
    return html_output


@spaces.GPU
def generate_journal_infographics(journal):
    """Render the journal text as an HTML/Bootstrap infographic with Qwen2.5-Coder."""
    prompt = f"Generate daily journal infographics using html for the following:\n\n{journal}"

    messages = [
        {"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate visually appealing daily journals and infographics. Your primary function is to transform user-provided details into structured and aesthetically engaging content. When generating infographics, you must use HTML and Bootstrap icons to create visually compelling and clear representations of the user's data or ideas. For daily journals, you should organize the information into an appealing, easy-to-read format that incorporates icons, headings, and layouts based on the user's preferences. Your designs should always focus on clarity, creativity, and user-centric formatting, ensuring the final product is both functional and visually engaging. Your ultimate goal is to help users effortlessly convert their daily activities and narratives into attractive visual content with minimal guidance."},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Inputs belong on the code model's device, not the YOLO detector's.
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)

    generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
    # Drop the prompt tokens so only the newly generated text is decoded.
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    documentation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(documentation)
    return documentation


# Object names that hint at each daily activity.
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road", "subway", "metro"],
}


def categorize_activity(detected_objects):
    """Group detected object names under the activities they suggest."""
    categorized_activities = {}

    for activity, objects in activity_categories.items():
        # Keep only the detections that belong to this activity.
        matched = [obj for obj in detected_objects if obj in objects]
        if matched:
            categorized_activities.setdefault(activity, []).extend(matched)

    return categorized_activities
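
# For example:
#   categorize_activity(["laptop", "fork", "dog"])
#   # -> {"Working": ["laptop"], "Meal Time": ["fork"]}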


def is_frame_different(frame1, frame2, threshold=0.9):
    """Return True when two frames differ visually (SSIM score below the threshold)."""
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold
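
# This helper is not wired into the pipeline yet; one way it could be used
# (sketch, assuming a prev_frame variable tracked inside the capture loop):
#   if prev_frame is None or is_frame_different(prev_frame, frame):
#       results = model.predict(source=frame_rgb, device=device)
#       prev_frame = frame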


@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
    """Sample the video roughly once per second, detect objects, and build journal entries.

    Note: frame_interval is currently unused; frames are selected by timestamp instead.
    """
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)

    last_processed_second = -1

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000
        current_second = int(current_time)

        # Process at most one frame per second of video.
        if current_second > last_processed_second:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            results = model.predict(source=frame_rgb, device=device)

            # Keep only detections above the confidence threshold.
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:
                    detected_objects.append(model.names[int(box.cls)])

            if detected_objects:
                annotated_frame = results[0].plot()

                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                # plot() follows the input channel order (RGB here), so flip to BGR for imwrite.
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])
                image_paths.append(frame_filename)

                activity_summary = categorize_activity(detected_objects)

                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds ({activity}): {', '.join(objects)}")

            last_processed_second = current_second

        frame_count += 1

    cap.release()

    return journal_entries, image_paths
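
# Illustrative call (the video path is hypothetical):
#   entries, frames = generate_journal_with_images("day_clip.mp4", confidence_threshold=0.8)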


def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
    pil_images = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        # OpenCV loads BGR; PIL expects RGB.
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)
        pil_images.append(pil_image)

    infographic_html = ''

    # Build the infographic only when at least two annotated frames were captured.
    if len(pil_images) >= 2:
        first_frame_detail = generate_image_desc(pil_images[0])
        infographic_html = generate_journal_infographics(first_frame_detail)

    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()

    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery, infographic_html])

iface.launch()