import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import json
import spaces
import markdown
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer
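# Note: the @spaces.GPU decorators below assume this app runs on a Hugging Face
# Spaces ZeroGPU host; for a local run they can be removed.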

# Object detector: YOLOv10-X weights pulled from the Hugging Face Hub.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Vision-language model used to describe a key frame as a journal entry.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_vision = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

# Code model used to render the journal text as an HTML infographic.
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

SYSTEM_INSTRUCTION = "You are DailySnap. Your job is to analyse the given image and write a daily journal entry about it, using a random time of day."


def extract_assistant_reply(input_string):
    """Return the text that follows the assistant header in a raw Llama chat transcript."""
    start_tag = "<|start_header_id|>assistant<|end_header_id|>"
    start_index = input_string.find(start_tag)
    if start_index == -1:
        return "Assistant's reply not found."
    start_index += len(start_tag)
    assistant_reply = input_string[start_index:].strip()
    return assistant_reply
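# Example (hypothetical transcript): for a decoded string ending in
# "<|start_header_id|>assistant<|end_header_id|>\nHad coffee at 9am.<|eot_id|>"
# this returns "Had coffee at 9am.<|eot_id|>"; trailing special tokens such as
# <|eot_id|> are not stripped here.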


def extract_json_from_markdown(markdown_text):
    """Extract and parse a JSON payload from the first fenced code block in markdown_text."""
    try:
        start_idx = markdown_text.find('```')
        end_idx = markdown_text.find('```', start_idx + 3)

        # Skip the language tag on the opening fence, if present.
        if markdown_text[start_idx:start_idx + 7] in ('```html', '```json'):
            start_idx += 7
        else:
            start_idx += len('```')

        json_str = markdown_text[start_idx:end_idx].strip()
        return json.loads(json_str)
    except Exception as e:
        print(f"Error extracting JSON: {e}")
        return None
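# Note: this helper is not called anywhere below; a hypothetical use would be
# parsing a model reply that wraps JSON in a fenced block, e.g.
#   extract_json_from_markdown('```json\n{"mood": "happy"}\n```')  # -> {'mood': 'happy'}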


@spaces.GPU
def generate_image_desc(image):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": SYSTEM_INSTRUCTION}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Send inputs to the vision model's device (not the YOLO model's).
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)

    output = model_vision.generate(**inputs, max_new_tokens=300)
    markdown_text = processor.decode(output[0])
    print(markdown_text)

    markdown_text = extract_assistant_reply(markdown_text)
    html_output = markdown.markdown(markdown_text)
    return html_output


@spaces.GPU
def generate_journal_infographics(journal):
    prompt = f"Generate a daily journal infographic using HTML for the following:\n\n{journal}"

    messages = [
        {"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate infographics using HTML and Bootstrap icons, producing a highly appealing daily journal from the user's details."},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Send inputs to the code model's device (not the YOLO model's).
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)

    generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
    # Drop the prompt tokens so only the newly generated text is decoded.
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    documentation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(documentation)
    return documentation


# YOLO class names that map to each high-level activity category.
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road", "subway", "metro"],
}


def categorize_activity(detected_objects):
    """Group detected object labels under the activity categories they match."""
    categorized_activities = {}
    for activity, objects in activity_categories.items():
        matched = [obj for obj in detected_objects if obj in objects]
        if matched:
            categorized_activities[activity] = matched
    return categorized_activities
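# Example: ["laptop", "fork", "cup"] yields
#   {"Working": ["laptop"], "Meal Time": ["fork"]}
# ("cup" matches no category and is dropped).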


def is_frame_different(frame1, frame2, threshold=0.9):
    """Return True when two frames differ enough (SSIM below threshold) to be worth processing."""
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold
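# Note: is_frame_different is not wired into the loop below, which instead
# samples one frame per second. A sketch of how it could gate detection,
# assuming a prev_frame variable carried across iterations:
#   if prev_frame is None or is_frame_different(prev_frame, frame):
#       results = model.predict(source=frame_rgb, device=device)
#   prev_frame = frame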


@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
    # NOTE: frame_interval is currently unused; frames are sampled once per second instead.
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)

    last_processed_second = -1

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000
        current_second = int(current_time)

        # Process at most one frame per second of video.
        if current_second > last_processed_second:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            results = model.predict(source=frame_rgb, device=device)

            # Keep only detections above the confidence threshold.
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:
                    detected_objects.append(model.names[int(box.cls)])

            if detected_objects:
                annotated_frame = results[0].plot()

                # plot() follows the input channel order (RGB here), so flip back to BGR for imwrite.
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])
                image_paths.append(frame_filename)

                activity_summary = categorize_activity(detected_objects)

                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds ({activity}): {', '.join(objects)}")

            last_processed_second = current_second

        frame_count += 1

    cap.release()

    return journal_entries, image_paths


def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
    pil_images = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)
        pil_images.append(pil_image)

    infographic_html = ''

    # Describe the first annotated frame and render it as an infographic
    # (only when at least two frames were captured).
    if len(pil_images) >= 2:
        first_frame_detail = generate_image_desc(pil_images[0])
        infographic_html = generate_journal_infographics(first_frame_detail)

    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()

    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery, infographic_html])

iface.launch()