haor committed · verified
Commit d98182a · 1 Parent(s): edd777f

Update app.py

Files changed (1): app.py +34 -16
app.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 import hashlib
 import numpy as np
 import cv2
+import time
 from PIL import Image
 
 # if you changed the MLP architecture during training, change it also here:
@@ -49,11 +50,6 @@ def phashstr(image, hash_size=8, highfreq_factor=4):
     diff = dctlowfreq > med
     return _binary_array_to_hex(diff.flatten())
 
-def normalized(a, axis=-1, order=2):
-    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
-    l2[l2 == 0] = 1
-    return a / np.expand_dims(l2, axis)
-
 def convert_numpy_types(data):
     if isinstance(data, dict):
         return {key: convert_numpy_types(value) for key, value in data.items()}
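The hunk above shows only the tail of phashstr. For context, here is a minimal sketch of the full DCT-based perceptual hash it appears to implement, modeled on the imagehash library's phash; everything above the two lines visible in the diff is an assumption:

import numpy as np
import scipy.fftpack
from PIL import Image

def _binary_array_to_hex(arr):
    # Pack the boolean bit array into a hex string, 4 bits per hex digit.
    bit_string = ''.join(str(b) for b in 1 * arr.flatten())
    width = int(np.ceil(len(bit_string) / 4))
    return '{:0>{width}x}'.format(int(bit_string, 2), width=width)

def phashstr(image, hash_size=8, highfreq_factor=4):
    # Shrink to a small grayscale square, take a 2D DCT, keep the
    # low-frequency corner, and threshold each coefficient at the median.
    img_size = hash_size * highfreq_factor
    image = image.convert('L').resize((img_size, img_size), Image.LANCZOS)
    pixels = np.asarray(image)
    dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
    dctlowfreq = dct[:hash_size, :hash_size]
    med = np.median(dctlowfreq)
    diff = dctlowfreq > med
    return _binary_array_to_hex(diff.flatten())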
@@ -65,15 +61,28 @@ def convert_numpy_types(data):
         return int(data)
     else:
         return data
-
-def predict(image):
-    model = MLP(768) # CLIP embedding dim is 768 for CLIP ViT L 14
-    pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    model.load_state_dict(torch.hub.load_state_dict_from_url(pthpath, map_location=device))
-    model.to(device).eval()
-    model2, preprocess = clip.load("ViT-L/14", device=device)
+
+def normalized_np(a, axis=-1, order=2):
+    import numpy as np  # pylint: disable=import-outside-toplevel
+
+    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
+    l2[l2 == 0] = 1
+    return a / np.expand_dims(l2, axis)
+
+def normalized(a, axis=-1, order=2):
+    l2 = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
+    l2[l2 == 0] = 1
+    return a / l2
+
+
+model = MLP(768) # CLIP embedding dim is 768 for CLIP ViT L 14
+pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.load_state_dict(torch.hub.load_state_dict_from_url(pthpath, map_location=device))
+model.to(device).eval()
+model2, preprocess = clip.load("ViT-L/14", device=device)
+
+def predict(image):
 
     image = Image.fromarray(image)
     image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
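Two things happen in this hunk: the numpy helper is kept as normalized_np while the hot path switches to a pure-torch normalized, so the embedding no longer round-trips through .cpu().numpy() and back; and the MLP weights and the CLIP model are hoisted out of predict, so they load once at import time instead of on every request. A quick equivalence check (a hypothetical test, not part of the commit) showing the two normalization implementations agree up to float precision:

import numpy as np
import torch

def normalized_np(a, axis=-1, order=2):
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)

def normalized(a, axis=-1, order=2):
    l2 = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
    l2[l2 == 0] = 1
    return a / l2

x = torch.randn(4, 768)  # shaped like a batch of CLIP ViT-L/14 embeddings
assert np.allclose(normalized(x).numpy(), normalized_np(x.numpy()), atol=1e-6)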
@@ -81,15 +90,24 @@ def predict(image):
     phash = phashstr(image)
     md5 = hashlib.md5(image.tobytes()).hexdigest()
     sha1 = hashlib.sha1(image.tobytes()).hexdigest()
-
     inputs = preprocess(image).unsqueeze(0).to(device)
 
     with torch.no_grad():
+        start_time = time.time()
         img_emb = model2.encode_image(inputs)
-        img_emb = normalized(img_emb.cpu().numpy())
-    img_emb_tensor = torch.from_numpy(img_emb).to(device).float()
-    prediction = model(img_emb_tensor).item()
+        end_time = time.time()
+        print(f"Encoding image took {end_time - start_time} seconds")
+
+        start_time = time.time()
+        img_emb = normalized(img_emb).float()
+        end_time = time.time()
+        print(f"Normalizing image took {end_time - start_time} seconds")
 
+        start_time = time.time()
+        prediction = model(img_emb).item()
+        end_time = time.time()
+        print(f"Making prediction took {end_time - start_time} seconds")
+
     result = {
         "clip_aesthetic": prediction,
         "phash": phash,
 