File size: 2,472 Bytes
897701e
 
 
a0727b2
897701e
 
93fe568
 
897701e
87045f7
a0727b2
 
 
 
 
 
 
 
87045f7
 
 
 
 
a0727b2
897701e
dab8972
a0727b2
dab8972
897701e
 
 
 
dab8972
 
 
 
 
 
 
897701e
 
 
 
 
dab8972
 
897701e
 
 
dab8972
897701e
 
ee68d20
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import gradio as gr
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP pre-processor and model once at import time from a single
# checkpoint id, so the two can never drift apart.
_CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)
model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)

def image_similarity(image: Image.Image, positive_prompt: str, negative_prompts: str):
    """Check whether the positive prompt matches the image better than every negative prompt.

    The image and all prompts are scored with CLIP, the image-to-text logits
    are turned into probabilities with a softmax over the prompts, and the
    positive prompt "wins" only if its probability is strictly greater than
    that of every negative prompt.

    Args:
        image: The picture to classify, as a PIL image.
        positive_prompt: Text describing the action/scene to detect.
        negative_prompts: Semicolon-separated texts describing what the image
            should *not* be. Blank entries (e.g. from a trailing ``;``) are
            ignored.

    Returns:
        A ``(is_positive, message)`` tuple: ``is_positive`` is ``True`` when
        the positive prompt has the highest probability, and ``message`` is a
        human-readable string with that probability to four decimals.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        raise gr.Error("Please provide an image.")

    # Split the semicolon-separated negatives and drop blank entries so that a
    # trailing ';' does not silently add an empty prompt to the comparison.
    negatives = [part.strip() for part in (negative_prompts or "").split(";")]
    negatives = [text for text in negatives if text]
    if not negatives:
        # Keep the historical behaviour for a blank field: compare against a
        # single empty prompt rather than failing.
        negatives = [""]

    # Index 0 is always the positive prompt; everything after it is negative.
    prompts = [(positive_prompt or "").strip(), *negatives]

    # Hand the raw PIL image to the processor: it performs CLIP's own
    # resize/crop/rescale/normalisation. Pre-normalising the image by hand
    # would preprocess it twice and feed the model wrongly scaled pixels.
    inputs = processor(
        text=prompts,
        images=image.convert("RGB"),
        return_tensors="pt",
        padding=True,
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    # One image in the batch, so row 0 holds the probabilities for each prompt.
    probs = outputs.logits_per_image.softmax(dim=1)[0]
    positive_prob = probs[0].item()
    is_positive_highest = positive_prob > probs[1:].max().item()

    return is_positive_highest, f"Probability for Positive Prompt: {positive_prob:.4f}"

# Gradio UI: one image plus a positive prompt and a semicolon-separated list of
# negative prompts in; the boolean verdict and the positive-prompt probability out.
interface = gr.Interface(
    fn=image_similarity,
    inputs=[
        gr.components.Image(type="pil"),
        gr.components.Text(label="Enter positive prompt e.g. 'a person drinking a beverage'"),
        gr.components.Textbox(label="Enter negative prompts, separated by semicolon e.g. 'an empty scene; person without beverage'", placeholder="negative prompt 1; negative prompt 2; ..."),
    ],
    outputs=[
        gr.components.Textbox(label="Result"),
        gr.components.Textbox(label="Probability for Positive Prompt")
    ],
    title="Engagify's Image Action Detection",
    # The description names the checkpoint that is actually loaded
    # (openai/clip-vit-base-patch32).
    description="[Author: Ibrahim Hasani] This Method uses CLIP-VIT [Version: BASE-PATCH-32] to determine if an action is being performed in an image or not. (Binary Classifier). It contrasts an Action against multiple negative labels. Ensure the prompts accurately describe the desired detection.",
    live=False,
    theme=gr.themes.Monochrome(),
)

# Start the web server for the demo.
interface.launch()