# Convert CLIP models to CoreML

In [None]:
!pip install torch transformers coremltools

In [None]:
from transformers import CLIPProcessor, CLIPModel

# Checkpoint identifier handed to every from_pretrained() call in this notebook,
# so the processor and both wrapped models are loaded from the same checkpoint.
model_version = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"

# Processor used by later cells to turn raw text / images into PyTorch tensors.
processor = CLIPProcessor.from_pretrained(model_version)

# Text encoder

In [None]:
class WrappedCLIPModel_Text(CLIPModel):
    """CLIPModel variant whose forward pass returns only the text features.

    forward() delegates to get_text_features(), so calling (or tracing) the
    model directly on text inputs yields the text embedding.
    """

    def forward(self, *args, **kwargs):
        """Pass all arguments through to get_text_features() and return its result."""
        text_features = self.get_text_features(*args, **kwargs)
        return text_features


model_pt_text = WrappedCLIPModel_Text.from_pretrained(model_version)
# Switch to evaluation mode; kept as the final expression of the cell.
model_pt_text.eval()

In [None]:
import torch

# Example prompt used only to obtain token ids for tracing: "the" followed by
# "example text" repeated 37 times.
text = " ".join(["the"] + ["example text"] * 37)  # 77 tokens
processed_text = processor(text=text, images=None, return_tensors="pt", padding=True)
example_token_ids = processed_text.input_ids
print(len(example_token_ids[0]), example_token_ids)

# Trace the wrapped text model on the example ids with gradients disabled.
with torch.no_grad():
    model_traced = torch.jit.trace(model_pt_text, example_token_ids, strict=True)

In [None]:
import coremltools as ct
import numpy as np

# Convert the traced text model to Core ML.
# The input is a fixed-size (1, 77) tensor of token ids. It is declared as
# float32, so callers have to pass the ids as floats (the correctness check
# in this notebook does so via `.float()`).
text_input_shape = ct.Shape(shape=(1, 77))
text_input_spec = ct.TensorType(
    name="input_text_token_ids",
    shape=text_input_shape,
    dtype=np.float32,
)
text_output_spec = ct.TensorType(name="output_embedding", dtype=np.float16)

model_coreml = ct.convert(
    model_traced,
    inputs=[text_input_spec],
    outputs=[text_output_spec],
    minimum_deployment_target=ct.target.macOS13,
    convert_to="mlprogram",
)

In [None]:
# Show the description stored in the converted model's spec as this cell's result.
model_coreml.get_spec().description

In [None]:
# Save the converted text model under the given .mlpackage name.
model_coreml.save("CLIP-ViT-H-14-laion2B-s32B-b79K.text-encoder.mlpackage")

## Check correctness
The mean difference between the PyTorch and Core ML outputs should be on the order of 1e-5.

In [None]:
import numpy as np
import torch
# Compare the PyTorch text embedding with the Core ML one for a short prompt.
with torch.no_grad():
    processed_text = processor(text="hello there", images=None, return_tensors="pt", padding=True)
    input_ids = processed_text.input_ids
    # The Core ML model was converted with a fixed (1, 77) input, so right-pad
    # the ids to 77 entries with 49407.
    # NOTE(review): 49407 is presumably the tokenizer's end-of-text/pad id - confirm.
    pad_length = 77 - input_ids.shape[1]
    input_ids = torch.cat([input_ids, torch.tensor([[49407] * pad_length])], dim=1)
    print("input shape:", input_ids.shape)

    # The PyTorch reference runs on the unpadded processor output, while the
    # Core ML model receives the padded ids.
    res_pt = model_pt_text(**processed_text)
    print(f"original output: shape {res_pt.shape}, {res_pt}")

    # The Core ML input was declared as float32, hence the .float() cast.
    coreml_out = model_coreml.predict({'input_text_token_ids': input_ids.float()})
    res_coreml = torch.tensor(coreml_out['output_embedding'])
    print(f"coreml output: shape {res_coreml.shape}, {res_coreml}, type {type(res_coreml)}")

    difference = res_pt - res_coreml
    # The signed mean lets positive and negative errors cancel and a signed
    # max ignores large negative deviations, so also report absolute errors.
    abs_difference = difference.abs()
    print(
        f"mean difference: {torch.sum(difference)/difference.shape[1]}, "
        f"mean abs difference: {abs_difference.mean()}, "
        f"max abs difference: {abs_difference.max()}"
    )



# Image encoder

In [None]:
# wrap CLIPModel so that forward() function returns get_image_features()
class WrappedCLIPModel_Image(CLIPModel): 
 def forward(self, *args, **kwargs):
 return self.get_image_features(*args, **kwargs)

model_pt_image = WrappedCLIPModel_Image.from_pretrained(model_version)
model_pt_image.eval()

In [None]:
from PIL import Image
import torch

# Run the processor on a sample image to obtain a correctly shaped
# pixel_values tensor.
image = Image.open("example.jpg")
processed_image = processor(text=None, images=image, return_tensors="pt", padding=True)

# The trace input is random data with the same shape/dtype as pixel_values.
trace_input = torch.rand_like(processed_image.pixel_values)

# Trace the wrapped image model with gradients disabled.
with torch.no_grad():
    model_traced = torch.jit.trace(model_pt_image, trace_input, strict=True)

In [None]:
import coremltools as ct
import numpy as np

# Convert the traced image model to Core ML.
# The input shape is taken from the tensor the model was traced with.
image_input_shape = ct.Shape(shape=trace_input.shape)
image_input_spec = ct.TensorType(
    name="input_image_preproessed",
    shape=image_input_shape,
    dtype=np.float16,
)
image_output_spec = ct.TensorType(name="output_embedding", dtype=np.float16)

model_coreml = ct.convert(
    model_traced,
    inputs=[image_input_spec],
    outputs=[image_output_spec],
    minimum_deployment_target=ct.target.macOS13,
    convert_to="mlprogram",
)

In [None]:
# Show the description stored in the converted model's spec as this cell's result.
model_coreml.get_spec().description

In [None]:
# Save the converted image model under the given .mlpackage name.
model_coreml.save("CLIP-ViT-H-14-laion2B-s32B-b79K.image-encoder.mlpackage")

## Check correctness
The mean difference between the PyTorch and Core ML outputs should be on the order of 1e-5.

In [None]:

# Compare the PyTorch image embedding with the Core ML one for the sample image.
with torch.no_grad():
    image = Image.open("example.jpg")

    processed_image = processor(text=None, images=image, return_tensors="pt", padding=True)
    print("input shape:", processed_image.pixel_values.shape)

    res_pt = model_pt_image.get_image_features(**processed_image)
    print(f"original output: shape {res_pt.shape}, {res_pt}")

    coreml_out = model_coreml.predict({'input_image_preproessed': processed_image.pixel_values})
    res_coreml = torch.tensor(coreml_out['output_embedding'])
    print(f"coreml output: shape {res_coreml.shape}, {res_coreml}, type {type(res_coreml)}")

    difference = res_pt - res_coreml
    # The signed mean lets positive and negative errors cancel and a signed
    # max ignores large negative deviations, so also report absolute errors.
    abs_difference = difference.abs()
    cosine = torch.nn.functional.cosine_similarity(res_pt, res_coreml)
    print(
        f"mean difference: {torch.sum(difference)/difference.shape[1]}, "
        f"mean abs difference: {abs_difference.mean()}, "
        f"cosine: {cosine}, "
        f"max abs difference: {abs_difference.max()}"
    )

# Check performance

In [None]:
import time
from tqdm.auto import tqdm

# Number of timed forward passes per backend.
n_iters = 100

# NOTE: from here on model_pt_image refers to the model on 'mps' in float16;
# earlier cells that feed it CPU float32 tensors may need the model reloaded
# before they are re-run.
model_pt_image = model_pt_image.to('mps', dtype=torch.float16)

# Benchmark the PyTorch model on the GPU. Gradients are disabled so no
# autograd state is recorded, and the MPS queue is drained before each clock
# read so the timing covers the queued GPU work as well as the dispatch.
with torch.no_grad():
    torch.mps.synchronize()
    start = time.perf_counter()
    for _ in tqdm(range(n_iters)):
        model_pt_image(
            pixel_values=torch.rand_like(
                processed_image.pixel_values,
                device=model_pt_image.device,
                dtype=torch.float16,
            )
        )
    torch.mps.synchronize()
    end = time.perf_counter()
print("original (GPU): ", (end-start)/n_iters)

# Benchmark the converted Core ML model on fresh random inputs of the same shape.
start = time.perf_counter()
for _ in tqdm(range(n_iters)):
    model_coreml.predict({'input_image_preproessed': torch.rand_like(processed_image.pixel_values)})
end = time.perf_counter()
print("coreml: ", (end-start)/n_iters)
