import tempfile

import gradio as gr
from autodistill.detection import CaptionOntology
from autodistill_clip import CLIP
from autodistill_metaclip import MetaCLIP
from PIL import Image

# Ontologies are set per request below, so the models are created without one.
clip_model = CLIP(None)
metaclip_model = MetaCLIP(None)

# create side-by-side interface

def clip_model_interface(image, text):
    # Append a catch-all class so the prompt list always has more than one option.
    text = text + ", something else"
    classes = [t.strip() for t in text.split(",")]

    with tempfile.NamedTemporaryFile(suffix=".jpg") as temp:
        # Gradio passes the image as a numpy array; save it to disk for predict().
        image = Image.fromarray(image.astype("uint8"), "RGB")
        image.save(temp.name)

        clip_model.ontology = CaptionOntology({t: t for t in classes})

        predictions = clip_model.predict(temp.name)

        labels = [classes[i] for i in predictions.class_id.tolist()]
        confidences = predictions.confidence.tolist()

        return {k: v for k, v in zip(labels, confidences)}

def metaclip_model_interface(image, text):
    # Append a catch-all class so the prompt list always has more than one option.
    text = text + ", something else"
    classes = [t.strip() for t in text.split(",")]

    with tempfile.NamedTemporaryFile(suffix=".jpg") as temp:
        image = Image.fromarray(image.astype("uint8"), "RGB")
        image.save(temp.name)

        metaclip_model.ontology = CaptionOntology({t: t for t in classes})

        predictions = metaclip_model.predict(temp.name, confidence=0)

        labels = [classes[i] for i in predictions.class_id.tolist()]
        confidences = predictions.confidence.tolist()

        return {k: v for k, v in zip(labels, confidences)}

def combined_model_interface(input_image, input_text):
    # Run the same image and prompt through both models.
    clip_output = clip_model_interface(input_image, input_text)
    metaclip_output = metaclip_model_interface(input_image, input_text)

    # Return the results from both functions as a tuple, one per output component.
    return clip_output, metaclip_output

inputs = [
    "image",
    "text"
]

outputs = [
    gr.outputs.Label(type="confidences", label="CLIP"),
    gr.outputs.Label(type="confidences", label="MetaCLIP")
]

title = "CLIP vs MetaCLIP"

description = """
CLIP is a zero-shot classification and embedding model developed by OpenAI. MetaCLIP is a model that uses the CLIP architecture with an open dataset, developed by Meta AI.

Use this space to try out the models and see how they perform on your own images and text.

Note: Due to the way this space was implemented, CLIP will only return the top class. A fix is coming soon.

This project uses the following dependencies:

- [autodistill-clip](https://github.com/autodistill/autodistill-clip)
- [autodistill-metaclip](https://github.com/autodistill/autodistill-metaclip)
"""

gr.Interface(
    fn=combined_model_interface,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    allow_flagging=False,
    layout="vertical"
).launch()
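
# The commented snippet below is a minimal sketch for sanity-checking one of the
# wrappers outside Gradio; it is not part of the Space and assumes a local image
# at the hypothetical path "example.jpg". It feeds a numpy array into
# clip_model_interface, the same kind of input the "image" component provides,
# and prints the {label: confidence} dict defined above.
#
# import numpy as np
# from PIL import Image
#
# array = np.array(Image.open("example.jpg").convert("RGB"))
# print(clip_model_interface(array, "a cat, a dog"))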