"""Gradio demo that labels objects with Grounding DINO and GPT-4V.

Grounding DINO proposes boxes for a free-text prompt and GPT-4V assigns each
box one of the user-supplied class names.
"""

import os
import tempfile

import cv2
import gradio as gr
from autodistill.core.custom_detection_model import CustomDetectionModel
from autodistill.detection import CaptionOntology
from autodistill.utils import plot
from autodistill_gpt_4v import GPT4V
from autodistill_grounding_dino import GroundingDINO

MARKDOWN = """
# DINO-GPT4V

Use Grounding DINO and GPT-4V to label specific objects.

Visit [awesome-openai-vision-api-experiments](https://github.com/roboflow/awesome-openai-vision-api-experiments)
repository to find more OpenAI Vision API experiments or contribute your own."""


def _parse_classes(prompt):
    """Split a comma-separated prompt into unique, non-empty class names.

    Whitespace around each name is ignored, so ``"cat,dog"``, ``"cat, dog"``
    and ``" cat ,dog, "`` all yield ``["cat", "dog"]``. First-seen order is
    preserved.
    """
    names = (part.strip() for part in (prompt or "").split(","))
    return list(dict.fromkeys(name for name in names if name))


def respond(api_key: str, input_image, dino_prompt: str, gpt_prompt: str):
    """Detect objects with Grounding DINO, classify them with GPT-4V, and draw the result.

    Args:
        api_key: OpenAI API key forwarded to the GPT-4V classifier.
        input_image: Image array from the ``gr.Image(type="numpy")`` input
            (Gradio supplies RGB channel order), or ``None`` if nothing was
            uploaded.
        dino_prompt: Text prompt used as the single Grounding DINO caption/class.
        gpt_prompt: Comma-separated class names offered to GPT-4V.

    Returns:
        The annotated image as an RGB array for the output ``gr.Image``.

    Raises:
        gr.Error: If a required input is missing or the image cannot be
            written to disk.
    """
    if input_image is None:
        raise gr.Error("Please upload an input image.")

    api_key = (api_key or "").strip()
    if not api_key:
        raise gr.Error("Please provide an OpenAI API key.")

    dino_prompt = (dino_prompt or "").strip()
    if not dino_prompt:
        raise gr.Error("Please provide a Grounding DINO prompt.")

    classes = _parse_classes(gpt_prompt)
    if not classes:
        raise gr.Error("Please provide at least one class in the GPT-4V prompt.")

    # OpenCV reads and writes BGR, while Gradio hands us RGB.
    bgr_image = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)

    model = CustomDetectionModel(
        detection_model=GroundingDINO(
            CaptionOntology({dino_prompt: dino_prompt})
        ),
        classification_model=GPT4V(
            CaptionOntology({name: name for name in classes}),
            api_key=api_key,
        ),
    )

    # predict() is given a file path. Use a per-request temporary directory so
    # concurrent requests never overwrite each other's image.
    with tempfile.TemporaryDirectory() as workdir:
        image_path = os.path.join(workdir, "input.jpg")
        if not cv2.imwrite(image_path, bgr_image):
            raise gr.Error("Could not write the input image to disk.")
        detections = model.predict(image_path)
        # Annotate the same JPEG-decoded frame the model saw.
        frame = cv2.imread(image_path)

    annotated = plot(
        image=frame,
        detections=detections,
        classes=classes,
        raw=True,
    )

    # NOTE(review): assumes plot(raw=True) returns the annotated frame in the
    # same (BGR) channel order it was given - confirm against autodistill.
    # Gradio displays numpy images as RGB, so convert back before returning.
    return cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        with gr.Column():
            api_key_textbox = gr.Textbox(
                label="OpenAI API KEY", type="password")
            dino_prompt = gr.Textbox(label="Grounding DINO Prompt")
            gpt_prompt = gr.Textbox(label="GPT-4V Prompt")
            input_image = gr.Image(type="numpy", label="Input Image")
        with gr.Column():
            output_image = gr.Image(type="numpy", label="Output Image")
            submit_button = gr.Button()

    submit_button.click(
        fn=respond,
        inputs=[api_key_textbox, input_image, dino_prompt, gpt_prompt],
        outputs=[output_image],
    )

# Launched at module level, as in the original script.
demo.launch()