# Spaces status: Runtime error (page banner captured together with this listing)
import gradio as gr
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import CLIPModel, CLIPProcessor
# Load the CLIP preprocessing pipeline and the model weights once at import
# time so every request reuses the same objects. Both must come from the same
# checkpoint, hence the single shared identifier.
_CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)
model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)
def image_similarity(image: Image.Image, positive_prompt: str, negative_prompts: str):
    """Check whether the positive prompt matches the image better than every negative prompt.

    The image is scored against the positive prompt and each negative prompt
    with the module-level CLIP ``model``/``processor``; the scores are turned
    into probabilities with a softmax over the prompts.

    Args:
        image: PIL image to classify.
        positive_prompt: Text describing the action that should be detected.
        negative_prompts: Semicolon-separated texts describing what should NOT
            count as a detection. Blank entries (e.g. from a trailing ``;``)
            are ignored.

    Returns:
        A ``(is_positive_highest, message)`` tuple: a ``bool`` that is True when
        the positive prompt's probability is strictly greater than every
        negative prompt's, and a string reporting the positive probability
        formatted to four decimals.

    Raises:
        ValueError: If ``negative_prompts`` contains no non-blank prompt; the
            comparison is meaningless without at least one alternative.
    """
    # Drop blank entries so an empty field or a trailing ';' does not get
    # scored as an (empty) competing prompt.
    stripped = (text.strip() for text in (negative_prompts or "").split(";"))
    negatives = [text for text in stripped if text]
    if not negatives:
        raise ValueError("Provide at least one non-empty negative prompt.")

    # Index 0 is always the positive prompt; everything after it is negative.
    prompts = [(positive_prompt or "").strip(), *negatives]

    # Hand the PIL image straight to the processor: it applies the checkpoint's
    # own resize/crop/normalisation. Normalising manually beforehand would
    # preprocess the image twice and feed the model out-of-range pixel values.
    inputs = processor(
        text=prompts,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,  # keep over-long prompts within CLIP's context length
    )

    # Pure inference: no gradients are needed, so skip building the graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # One image in the batch, so take row 0: one probability per prompt.
    probs = outputs.logits_per_image.softmax(dim=1)[0]
    positive_prob = probs[0].item()
    best_negative_prob = probs[1:].max().item()

    is_positive_highest = positive_prob > best_negative_prob
    return is_positive_highest, f"Probability for Positive Prompt: {positive_prob:.4f}"
# Gradio UI: one image plus a positive prompt and a semicolon-separated list of
# negative prompts go in; the boolean verdict and the positive-prompt
# probability string returned by image_similarity come out.
interface = gr.Interface(
    fn=image_similarity,
    inputs=[
        gr.components.Image(type="pil"),  # deliver the upload as a PIL image
        gr.components.Text(label="Enter positive prompt e.g. 'a person drinking a beverage'"),
        gr.components.Textbox(
            label="Enter negative prompts, separated by semicolon e.g. 'an empty scene; person without beverage'",
            placeholder="negative prompt 1; negative prompt 2; ...",
        ),
    ],
    outputs=[
        gr.components.Textbox(label="Result"),
        gr.components.Textbox(label="Probability for Positive Prompt"),
    ],
    title="Engagify's Image Action Detection",
    # The version tag names the checkpoint this file actually loads
    # (openai/clip-vit-base-patch32); it previously said BASE-PATCH-16.
    description="[Author: Ibrahim Hasani] This Method uses CLIP-VIT [Version: BASE-PATCH-32] to determine if an action is being performed in an image or not. (Binary Classifier). It contrasts an Action against multiple negative labels. Ensure the prompts accurately describe the desired detection.",
    live=False,  # run only when the user submits, not on every keystroke
    theme=gr.themes.Monochrome(),
)
interface.launch()