import gradio as gr
import torch
import torch.nn as nn
import clip
import pandas as pd
import hashlib
import numpy as np
import cv2
import time
from PIL import Image

# MLP regression head that maps a CLIP image embedding to a single aesthetic score
class MLP(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        )
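        # Note: only Linear and Dropout layers, no nonlinear activations; this matches the layer
        # layout expected by the sac+logos+ava1-l14-linearMSE checkpoint loaded below.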

    def forward(self, x):
        return self.layers(x)

# Convert a boolean bit array to a hexadecimal string (e.g. an 8x8 hash -> 16 hex characters)
def binary_array_to_hex(arr):
    bit_string = ''.join(str(b) for b in 1 * arr.flatten())
    width = int(np.ceil(len(bit_string) / 4))
    return '{:0>{width}x}'.format(int(bit_string, 2), width=width)

# Calculate perceptual hash of an image
def phash(image, hash_size=8, highfreq_factor=4):
    if hash_size < 2:
        raise ValueError('Hash size must be greater than or equal to 2')

    import scipy.fftpack
    img_size = hash_size * highfreq_factor
    image = image.convert('L').resize((img_size, img_size), Image.Resampling.LANCZOS)
    pixels = np.asarray(image)
    dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
    dctlowfreq = dct[:hash_size, :hash_size]
    med = np.median(dctlowfreq)
    diff = dctlowfreq > med
    return binary_array_to_hex(diff)
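
# Illustrative helper (not part of the original script): perceptual hashes are compared by the
# Hamming distance between their bits; a small distance suggests near-duplicate images.
def phash_distance(hex1, hex2):
    # XOR the hashes as integers and count the differing bits
    return bin(int(hex1, 16) ^ int(hex2, 16)).count('1')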

# Convert NumPy scalar types to Python built-ins so the result dict is JSON-serializable
def convert_numpy_types(data):
    if isinstance(data, dict):
        return {key: convert_numpy_types(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [convert_numpy_types(item) for item in data]
    elif isinstance(data, np.float64):
        return float(data)
    elif isinstance(data, np.int64):
        return int(data)
    else:
        return data
    
# L2-normalize a tensor along the given axis (the aesthetic head is applied to unit-norm CLIP embeddings)
def normalize(a, axis=-1, order=2):
    l2 = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
    l2[l2 == 0] = 1
    return a / l2

# Load the pre-trained aesthetic MLP head and the CLIP backbone
model = MLP(768)  # CLIP ViT-L/14 image embeddings are 768-dimensional
pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"
# load_state_dict_from_url downloads the checkpoint on first use and caches it locally
model.load_state_dict(torch.hub.load_state_dict_from_url(pthpath, map_location=device))
model.to(device).eval()
model2, preprocess = clip.load("ViT-L/14", device=device)

# Predict aesthetic score and other metrics of an image
def predict(image):
    # Preprocess the input image and compute basic image metrics
    image = Image.fromarray(image)
    # Variance of the Laplacian on the grayscale image is a simple sharpness/blur measure
    image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    laplacian_variance = cv2.Laplacian(image_np, cv2.CV_64F).var()
    # Perceptual hash plus MD5/SHA-1 digests over the raw decoded pixel bytes
    phash_value = phash(image)
    md5 = hashlib.md5(image.tobytes()).hexdigest()
    sha1 = hashlib.sha1(image.tobytes()).hexdigest()
    inputs = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Extract image features using CLIP model
        start_time = time.time()
        img_emb = model2.encode_image(inputs)
        end_time = time.time()
        print(f"Encoding image took {end_time - start_time} seconds")

        # Normalize image features
        start_time = time.time()
        img_emb = normalize(img_emb).float()
        end_time = time.time()
        print(f"Normalizing image took {end_time - start_time} seconds")

        # Predict aesthetic score using MLP model
        start_time = time.time()
        prediction = model(img_emb).item()
        end_time = time.time()
        print(f"Making prediction took {end_time - start_time} seconds")
        
    # Return prediction results
    result = {
        "clip_aesthetic": prediction,
        "phash": phash_value,
        "md5": md5, 
        "sha1": sha1,
        "laplacian_variance": laplacian_variance
    }
    return convert_numpy_types(result)
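
# Illustrative helper (not part of the original app): score a local image file directly, bypassing
# the Gradio UI; the path argument is a placeholder for any RGB image on disk.
def predict_file(path):
    return predict(np.array(Image.open(path).convert("RGB")))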

# Create web interface using Gradio
title = "CLIP Aesthetic Score"
description = "Upload an image to predict its aesthetic score using the CLIP model and calculate other image metrics."
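# Note: example1.jpg and example2.jpg must exist alongside this script for the Gradio examples
# to load; server_name='0.0.0.0' binds the app to all network interfaces, not just localhost.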

gr.Interface(
    fn=predict, 
    inputs=gr.Image(type="numpy"),
    outputs=gr.JSON(label="Result"),
    title=title,
    description=description,
    examples=[["example1.jpg"], ["example2.jpg"]]
).launch(server_name='0.0.0.0')