File size: 4,331 Bytes
2996858 d98182a 2996858 b5fd63d 2996858 b5fd63d 2996858 b5fd63d 2996858 b5fd63d 2996858 b5fd63d 2996858 b5fd63d 2996858 b5fd63d 7d44f91 d98182a b5fd63d d98182a b5fd63d d98182a b5fd63d d98182a b5fd63d 2996858 b5fd63d 2996858 b5fd63d d98182a 2996858 d98182a b5fd63d d98182a b5fd63d d98182a 2996858 b5fd63d d98182a b5fd63d 2996858 b5fd63d 2996858 7d44f91 2996858 b5fd63d 2996858 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import gradio as gr
import torch
import torch.nn as nn
import clip
import pandas as pd
import hashlib
import numpy as np
import cv2
import time
from PIL import Image
# MLP model definition
class MLP(nn.Module):
def __init__(self, input_size):
super().__init__()
self.layers = nn.Sequential(
nn.Linear(input_size, 1024),
nn.Dropout(0.2),
nn.Linear(1024, 128),
nn.Dropout(0.2),
nn.Linear(128, 64),
nn.Dropout(0.1),
nn.Linear(64, 16),
nn.Linear(16, 1),
)
def forward(self, x):
return self.layers(x)
# Convert binary array to hexadecimal string
def binary_array_to_hex(arr):
bit_string = ''.join(str(b) for b in 1 * arr.flatten())
width = int(np.ceil(len(bit_string) / 4))
return '{:0>{width}x}'.format(int(bit_string, 2), width=width)
# Calculate perceptual hash of an image
def phash(image, hash_size=8, highfreq_factor=4):
if hash_size < 2:
raise ValueError('Hash size must be greater than or equal to 2')
import scipy.fftpack
img_size = hash_size * highfreq_factor
image = image.convert('L').resize((img_size, img_size), Image.Resampling.LANCZOS)
pixels = np.asarray(image)
dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
dctlowfreq = dct[:hash_size, :hash_size]
med = np.median(dctlowfreq)
diff = dctlowfreq > med
return binary_array_to_hex(diff)
# Convert NumPy types to Python built-in types
def convert_numpy_types(data):
if isinstance(data, dict):
return {key: convert_numpy_types(value) for key, value in data.items()}
elif isinstance(data, list):
return [convert_numpy_types(item) for item in data]
elif isinstance(data, np.float64):
return float(data)
elif isinstance(data, np.int64):
return int(data)
else:
return data
# Normalize tensor
def normalize(a, axis=-1, order=2):
l2 = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
l2[l2 == 0] = 1
return a / l2
# Load pre-trained MLP model and CLIP model
model = MLP(768) # CLIP embedding dim is 768 for CLIP ViT L 14
pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"
model.load_state_dict(torch.hub.load_state_dict_from_url(pthpath, map_location=device))
model.to(device).eval()
model2, preprocess = clip.load("ViT-L/14", device=device)
# Predict aesthetic score and other metrics of an image
def predict(image):
# Preprocess image
image = Image.fromarray(image)
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
laplacian_variance = cv2.Laplacian(image_np, cv2.CV_64F).var()
phash_value = phash(image)
md5 = hashlib.md5(image.tobytes()).hexdigest()
sha1 = hashlib.sha1(image.tobytes()).hexdigest()
inputs = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
# Extract image features using CLIP model
start_time = time.time()
img_emb = model2.encode_image(inputs)
end_time = time.time()
print(f"Encoding image took {end_time - start_time} seconds")
# Normalize image features
start_time = time.time()
img_emb = normalize(img_emb).float()
end_time = time.time()
print(f"Normalizing image took {end_time - start_time} seconds")
# Predict aesthetic score using MLP model
start_time = time.time()
prediction = model(img_emb).item()
end_time = time.time()
print(f"Making prediction took {end_time - start_time} seconds")
# Return prediction results
result = {
"clip_aesthetic": prediction,
"phash": phash_value,
"md5": md5,
"sha1": sha1,
"laplacian_variance": laplacian_variance
}
return convert_numpy_types(result)
# Create web interface using Gradio
title = "CLIP Aesthetic Score"
description = "Upload an image to predict its aesthetic score using the CLIP model and calculate other image metrics."
gr.Interface(
fn=predict,
inputs=gr.Image(type="numpy"),
outputs=gr.JSON(label="Result"),
title=title,
description=description,
examples=[["example1.jpg"], ["example2.jpg"]]
).launch() |