import tempfile

import gradio as gr
from autodistill.detection import CaptionOntology
from autodistill_clip import CLIP
from autodistill_metaclip import MetaCLIP
from PIL import Image

# Ontologies are set per request below, so the models are created without one.
clip_model = CLIP(None)
metaclip_model = MetaCLIP(None)

# create side-by-side interface

def clip_model_interface(image, text):
    # Append a catch-all class so the prompt list always has more than one option.
    text = text + ", something else"
    classes = [t.strip() for t in text.split(",")]

    with tempfile.NamedTemporaryFile(suffix=".jpg") as temp:
        # Gradio passes the image as a numpy array; save it to disk for predict().
        image = Image.fromarray(image.astype("uint8"), "RGB")
        image.save(temp.name)

        clip_model.ontology = CaptionOntology({t: t for t in classes})

        predictions = clip_model.predict(temp.name)

        labels = [classes[i] for i in predictions.class_id.tolist()]
        confidences = predictions.confidence.tolist()

        return {k: v for k, v in zip(labels, confidences)}

def metaclip_model_interface(image, text):
    # Append a catch-all class so the prompt list always has more than one option.
    text = text + ", something else"
    classes = [t.strip() for t in text.split(",")]

    with tempfile.NamedTemporaryFile(suffix=".jpg") as temp:
        image = Image.fromarray(image.astype("uint8"), "RGB")
        image.save(temp.name)

        metaclip_model.ontology = CaptionOntology({t: t for t in classes})

        predictions = metaclip_model.predict(temp.name, confidence=0)

        labels = [classes[i] for i in predictions.class_id.tolist()]
        confidences = predictions.confidence.tolist()

        return {k: v for k, v in zip(labels, confidences)}

def combined_model_interface(input_image, input_text):
    # Run the same image and prompt through both models.
    clip_output = clip_model_interface(input_image, input_text)
    metaclip_output = metaclip_model_interface(input_image, input_text)

    # Return the results from both functions as a tuple, one per output component.
    return clip_output, metaclip_output

inputs = [
    "image",
    "text"
]

outputs = [
    gr.outputs.Label(type="confidences", label="CLIP"),
    gr.outputs.Label(type="confidences", label="MetaCLIP")
]

title = "CLIP vs MetaCLIP"

description = """
CLIP is a zero-shot classification and embedding model developed by OpenAI. MetaCLIP is a model that uses the CLIP architecture with an open dataset, developed by Meta AI.

Use this space to try out the models and see how they perform on your own images and text.

Note: Due to the way this space was implemented, CLIP will only return the top class. A fix is coming soon.

This project uses the following dependencies:

- [autodistill-clip](https://github.com/autodistill/autodistill-clip)
- [autodistill-metaclip](https://github.com/autodistill/autodistill-metaclip)
"""

gr.Interface(
    fn=combined_model_interface,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    allow_flagging=False,
    layout="vertical"
).launch()
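
# The commented snippet below is a minimal sketch for sanity-checking one of the
# wrappers outside Gradio; it is not part of the Space and assumes a local image
# at the hypothetical path "example.jpg". It feeds a numpy array into
# clip_model_interface, the same kind of input the "image" component provides,
# and prints the {label: confidence} dict defined above.
#
# import numpy as np
# from PIL import Image
#
# array = np.array(Image.open("example.jpg").convert("RGB"))
# print(clip_model_interface(array, "a cat, a dog"))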