import gradio as gr
from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration, TextIteratorStreamer
from threading import Thread
import re
import time
from PIL import Image
import torch
import cv2
import spaces

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

processor = LlavaOnevisionProcessor.from_pretrained(model_id)
model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
model.to("cuda")


# Capture a fixed number of frames from the camera and return them as PIL images.
def capture_camera_frames(num_frames):
    camera = cv2.VideoCapture(0)  # 0 selects the default camera
    frames = []
    for _ in range(num_frames):
        ret, frame = camera.read()
        if not ret:
            break
        # OpenCV delivers BGR; convert to RGB before wrapping in a PIL image.
        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frames.append(pil_img)
    camera.release()
    return frames


# Sketch of the frame-sampling helper that bot_streaming() calls below. It is not
# defined in this section, so this uniform-stride implementation is an assumption.
def sample_frames(video_file, num_frames):
    video = cv2.VideoCapture(video_file)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(total_frames // num_frames, 1)  # uniform sampling stride
    frames = []
    for i in range(total_frames):
        ret, frame = video.read()
        if not ret:
            continue
        if i % interval == 0:
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    video.release()
    return frames


@spaces.GPU
def bot_streaming(message, history):
    txt = message.text
    ext_buffer = f"user\n{txt} assistant"

    if message.files:
        if len(message.files) == 1:
            image = [message.files[0].path]
        elif len(message.files) > 1:
            image = [msg.path for msg in message.files]
    else:
        image = None

    # A message starting with "camera" switches input to live capture.
    # Note: these are PIL images, not file paths like the upload branch above.
    if txt.lower().startswith("camera"):
        image = capture_camera_frames(5)  # capture 5 frames

    if not image:
        raise gr.Error("You need to upload an image or video, or access the camera for LLaVA to work.")

    video_extensions = (".avi", ".mp4", ".mov", ".mkv", ".flv", ".wmv", ".mjpeg")
    image_extensions = tuple(Image.registered_extensions().keys())

    if len(image) == 1:
        if image[0].endswith(video_extensions):
            video = sample_frames(image[0], 32)
            image = None
            prompt = f"<|im_start|>user