"""Gradio demo: chat with a vision model about frames from the webcam."""

import os
import uuid

import cv2
import gradio as gr
import numpy as np

import neovision

MARKDOWN = """
# Welcome to VisionB 🧠 + 📸

Meet VisionB, your Visual Agent that combines the power of advanced GPT models with real-time visual inputs. Engage in interactive dialogues, ask questions, and gain insights with the added context of images from your webcam. Experience a new dimension of interaction where vision and conversational AI meet.
"""

# NOTE(review): "OpanAIConnector" is spelled as the neovision package is
# referenced in this file; confirm against the package before renaming.
connector = neovision.OpanAIConnector()


def save_image_to_drive(image: np.ndarray) -> str:
    """Write *image* to a uniquely named JPEG under ``data/``.

    Args:
        image: Pixel array handed straight to ``cv2.imwrite``.

    Returns:
        The relative path of the file that was written.

    Raises:
        OSError: If OpenCV reports that the file could not be written.
    """
    image_directory = "data"
    os.makedirs(image_directory, exist_ok=True)
    image_path = os.path.join(image_directory, f"{uuid.uuid4()}.jpeg")
    # cv2.imwrite signals failure by returning False instead of raising;
    # without this check the caller would get a path to a missing file.
    if not cv2.imwrite(image_path, image):
        raise OSError(f"Failed to write image to {image_path}")
    return image_path


def respond(image: np.ndarray, prompt: str, chat_history=None):
    """Answer *prompt* about the current webcam frame and extend the chat.

    Args:
        image: Current webcam frame from the ``gr.Image`` component, or
            ``None`` when no frame has been captured yet.
        prompt: The user's message from the textbox.
        chat_history: Existing chatbot history (list of message pairs), or
            ``None`` to start a new one. The list is extended in place.

    Returns:
        A ``("", chat_history)`` tuple: the empty string clears the textbox
        and the history refreshes the chatbot.

    Raises:
        gr.Error: If no webcam frame is available.
    """
    if image is None:
        # Submitting before the camera delivers a frame would otherwise
        # crash inside np.fliplr with an opaque error.
        raise gr.Error(
            "No webcam frame available yet. Allow camera access and try again."
        )
    if chat_history is None:
        chat_history = []

    # Mirror the frame horizontally, then swap the R and B channels.
    # NOTE(review): presumably Gradio supplies RGB and both cv2.imwrite and
    # the connector expect BGR - confirm against neovision.
    image = np.fliplr(image)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    image_path = save_image_to_drive(image)
    response = connector.simple_prompt(image=image, prompt=prompt)

    # First entry shows the captured frame, second the question and answer.
    chat_history.append(((image_path,), None))
    chat_history.append((prompt, response))
    return "", chat_history


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        webcam = gr.Image(sources=["webcam"], streaming=True)
        with gr.Column():
            chatbot = gr.Chatbot(height=500)
            message = gr.Textbox()
            clear_button = gr.ClearButton([message, chatbot])

    # Pressing Enter in the textbox sends the frame, prompt and history.
    message.submit(respond, [webcam, message, chatbot], [message, chatbot])

demo.launch(debug=False, show_error=True)