CLIPScore / app.py
taesiri's picture
Update app.py
73f9f45 verified
raw
history blame
1.81 kB
import torch
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces
# Single checkpoint id shared by the weights and the matching preprocessor,
# so the two can never drift apart.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"

# Both objects are created once at import time and reused by every request.
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
model = model.to("cuda")
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
@spaces.GPU
def calculate_score(image, text):
    """Score an image against semicolon-separated descriptions with CLIP.

    Args:
        image: The value of the Gradio image component, or ``None`` when no
            image is currently loaded.
        text: Candidate descriptions separated by ``;``. Whitespace around
            each description is stripped and empty entries are ignored.

    Returns:
        A dict mapping each description to its image-text logit divided by
        100, as a plain Python float. An empty dict is returned when there
        is no image or no non-empty description. If the same description
        appears more than once, it occupies a single key.
    """
    # Without an image the processor produces no pixel values and the model
    # call would raise, so treat "no image" like "no labels".
    if image is None:
        return {}

    # ``text or ""`` tolerates a missing textbox value.
    labels = [part.strip() for part in (text or "").split(";")]
    labels = [label for label in labels if label]
    if not labels:
        return {}

    # truncation=True clips over-long descriptions to the tokenizer's
    # maximum length instead of letting the text encoder fail on them.
    inputs = processor(
        text=labels,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    inputs = {key: tensor.to("cuda") for key, tensor in inputs.items()}

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Row 0 holds the logits of the single input image against every label;
    # tolist() yields plain Python floats.
    image_logits = outputs.logits_per_image[0].cpu().tolist()

    # NOTE(review): dividing by 100 presumably undoes CLIP's logit scale to
    # give a similarity-like value - confirm against the checkpoint config.
    return {label: logit / 100.0 for label, logit in zip(labels, image_logits)}
# Gradio UI: an image and a result label side by side, a textbox for the
# candidate descriptions, and one bundled example.
# NOTE(review): the nesting below assumes only the image and the label live
# inside the Row - confirm against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# CLIP Score")
    gr.Markdown("Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text")

    with gr.Row():
        image_input = gr.Image()
        output_label = gr.Label()
    text_input = gr.Textbox(label="Descriptions (separated by semicolons)")

    # Re-score whenever the image changes or the text is submitted; both
    # events use the same handler and the same components.
    for register_event in (image_input.change, text_input.submit):
        register_event(
            fn=calculate_score,
            inputs=[image_input, text_input],
            outputs=output_label,
        )

    example_rows = [
        [
            "cat.jpg",
            "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
        ]
    ]
    gr.Examples(
        examples=example_rows,
        fn=calculate_score,
        inputs=[image_input, text_input],
        outputs=output_label,
    )

demo.launch()