haor committed · verified
Commit d98182a · 1 Parent(s): edd777f

Update app.py

Files changed (1): app.py +34 -16
app.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 import hashlib
 import numpy as np
 import cv2
+import time
 from PIL import Image
 
 # if you changed the MLP architecture during training, change it also here:
@@ -49,11 +50,6 @@ def phashstr(image, hash_size=8, highfreq_factor=4):
     diff = dctlowfreq > med
     return _binary_array_to_hex(diff.flatten())
 
-def normalized(a, axis=-1, order=2):
-    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
-    l2[l2 == 0] = 1
-    return a / np.expand_dims(l2, axis)
-
 def convert_numpy_types(data):
     if isinstance(data, dict):
         return {key: convert_numpy_types(value) for key, value in data.items()}
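The hunk above shows only the tail of phashstr. For context, here is a minimal sketch of the full DCT-based perceptual hash it appears to implement, modeled on the imagehash library's phash; everything above the two lines visible in the diff is an assumption:

import numpy as np
import scipy.fftpack
from PIL import Image

def _binary_array_to_hex(arr):
    # Pack the boolean bit array into a hex string, 4 bits per hex digit.
    bit_string = ''.join(str(b) for b in 1 * arr.flatten())
    width = int(np.ceil(len(bit_string) / 4))
    return '{:0>{width}x}'.format(int(bit_string, 2), width=width)

def phashstr(image, hash_size=8, highfreq_factor=4):
    # Shrink to a small grayscale square, take a 2D DCT, keep the
    # low-frequency corner, and threshold each coefficient at the median.
    img_size = hash_size * highfreq_factor
    image = image.convert('L').resize((img_size, img_size), Image.LANCZOS)
    pixels = np.asarray(image)
    dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
    dctlowfreq = dct[:hash_size, :hash_size]
    med = np.median(dctlowfreq)
    diff = dctlowfreq > med
    return _binary_array_to_hex(diff.flatten())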
@@ -65,15 +61,28 @@ def convert_numpy_types(data):
         return int(data)
     else:
         return data
-
-def predict(image):
-    model = MLP(768) # CLIP embedding dim is 768 for CLIP ViT L 14
-    pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    model.load_state_dict(torch.hub.load_state_dict_from_url(pthpath, map_location=device))
-    model.to(device).eval()
-    model2, preprocess = clip.load("ViT-L/14", device=device)
+
+def normalized_np(a, axis=-1, order=2):
+    import numpy as np  # pylint: disable=import-outside-toplevel
+
+    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
+    l2[l2 == 0] = 1
+    return a / np.expand_dims(l2, axis)
+
+def normalized(a, axis=-1, order=2):
+    l2 = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
+    l2[l2 == 0] = 1
+    return a / l2
+
+
+model = MLP(768) # CLIP embedding dim is 768 for CLIP ViT L 14
+pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.load_state_dict(torch.hub.load_state_dict_from_url(pthpath, map_location=device))
+model.to(device).eval()
+model2, preprocess = clip.load("ViT-L/14", device=device)
+
+def predict(image):
 
     image = Image.fromarray(image)
     image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
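Two things happen in this hunk: the numpy helper is kept as normalized_np while the hot path switches to a pure-torch normalized, so the embedding no longer round-trips through .cpu().numpy() and back; and the MLP weights and the CLIP model are hoisted out of predict, so they load once at import time instead of on every request. A quick equivalence check (a hypothetical test, not part of the commit) showing the two normalization implementations agree up to float precision:

import numpy as np
import torch

def normalized_np(a, axis=-1, order=2):
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)

def normalized(a, axis=-1, order=2):
    l2 = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
    l2[l2 == 0] = 1
    return a / l2

x = torch.randn(4, 768)  # shaped like a batch of CLIP ViT-L/14 embeddings
assert np.allclose(normalized(x).numpy(), normalized_np(x.numpy()), atol=1e-6)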
@@ -81,15 +90,24 @@ def predict(image):
     phash = phashstr(image)
     md5 = hashlib.md5(image.tobytes()).hexdigest()
     sha1 = hashlib.sha1(image.tobytes()).hexdigest()
-
     inputs = preprocess(image).unsqueeze(0).to(device)
 
     with torch.no_grad():
+        start_time = time.time()
         img_emb = model2.encode_image(inputs)
-        img_emb = normalized(img_emb.cpu().numpy())
-    img_emb_tensor = torch.from_numpy(img_emb).to(device).float()
-    prediction = model(img_emb_tensor).item()
+        end_time = time.time()
+        print(f"Encoding image took {end_time - start_time} seconds")
+
+        start_time = time.time()
+        img_emb = normalized(img_emb).float()
+        end_time = time.time()
+        print(f"Normalizing image took {end_time - start_time} seconds")
 
+        start_time = time.time()
+        prediction = model(img_emb).item()
+        end_time = time.time()
+        print(f"Making prediction took {end_time - start_time} seconds")
+
     result = {
         "clip_aesthetic": prediction,
         "phash": phash,
 