import torch
from transformers import pipeline, SiglipModel, AutoProcessor
import numpy as np
import gradio as gr

# Load the CLIP checkpoint into a zero-shot image classification pipeline
clip_checkpoint = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
clip_detector = pipeline(model=clip_checkpoint, task="zero-shot-image-classification")

def postprocess(output):
    # Convert the pipeline output into a {label: score} dict for gr.Label
    return {out["label"]: float(out["score"]) for out in output}

def infer(image, candidate_labels):
    # Turn the comma-separated label string into a clean list for the pipeline
    candidate_labels = [label.strip() for label in candidate_labels.split(",")]
    clip_out = clip_detector(image, candidate_labels=candidate_labels)
    return postprocess(clip_out)
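
# --- Hedged sketch, not from the original file: the UI below advertises a CLIP vs.
# SigLIP comparison and SiglipModel/AutoProcessor are imported but never used.
# One possible SigLIP counterpart to clip_detector is sketched here; the checkpoint
# name and the siglip_detector helper are assumptions, not the Space author's code.
siglip_checkpoint = "google/siglip-base-patch16-224"  # assumed checkpoint
siglip_model = SiglipModel.from_pretrained(siglip_checkpoint)
siglip_processor = AutoProcessor.from_pretrained(siglip_checkpoint)

def siglip_detector(image, texts):
    # SigLIP expects padding="max_length" and scores image-text pairs with a sigmoid, not a softmax
    inputs = siglip_processor(text=texts, images=image, return_tensors="pt", padding="max_length")
    with torch.no_grad():
        outputs = siglip_model(**inputs)
    probs = torch.sigmoid(outputs.logits_per_image)[0]
    return {label: float(prob) for label, prob in zip(texts, probs)}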

def update_top_classes(num_classes):
    # Return a component update so the CLIP Label shows the requested number of classes
    return gr.update(num_top_classes=num_classes)

with gr.Blocks() as demo:
    gr.Markdown("# Compare CLIP and SigLIP")
    gr.Markdown("Compare the performance of CLIP and SigLIP on zero-shot classification in this Space 👇")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil")
            text_input = gr.Textbox(label="Input a comma-separated list of labels")
            slider = gr.Slider(minimum=3, maximum=20, step=1, value=3, label="Number of Top Classes")
            run_button = gr.Button("Run", visible=True)
        with gr.Column():
            clip_output = gr.Label(label="CLIP Output", num_top_classes=3)

    examples = [["./baklava.jpg", "baklava, souffle, tiramisu"]]
    gr.Examples(
        examples=examples,
        inputs=[image_input, text_input],
        outputs=[clip_output],
        fn=infer,
        cache_examples=True,
    )

    # Let the slider control how many classes the Label displays
    slider.change(
        fn=update_top_classes,
        inputs=slider,
        outputs=clip_output,
    )
    run_button.click(
        fn=infer,
        inputs=[image_input, text_input],
        outputs=[clip_output],
    )

demo.launch()