import io
from typing import Any, Dict, List

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer, CLIPModel


class EndpointHandler:
    """Compute FashionCLIP embeddings for text or image requests.

    An instance is callable: it takes a request payload dict and returns the
    embedding of the first input as a plain list of floats.
    """

    # Hub identifiers of the checkpoints loaded in ``__init__``.
    MODEL_ID = "patrickjohncyh/fashion-clip"
    TOKENIZER_ID = "openai/clip-vit-base-patch32"

    # Demo image used when an image request carries no usable image input.
    # Kept so payloads that relied on the previous hard-coded behaviour
    # still receive the same embedding.
    DEFAULT_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"

    # Seconds to wait for a remote image before giving up; without a timeout
    # a stalled server would block the handler indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, path: str = ""):
        """Load the model, image processor and tokenizer.

        Args:
            path: Accepted so the handler can be constructed with a model
                directory argument (presumably what the hosting runtime
                passes -- TODO confirm). It is currently unused: weights are
                always loaded by hub identifier.
        """
        self.model = CLIPModel.from_pretrained(self.MODEL_ID)
        self.processor = AutoProcessor.from_pretrained(self.MODEL_ID)
        self.tokenizer = AutoTokenizer.from_pretrained(self.TOKENIZER_ID)

    def __call__(self, data: Dict[str, Any]) -> List[float]:
        """Embed the request input and return the first embedding vector.

        Args:
            data: Request payload. ``data["parameters"]["mode"]`` selects
                ``"text"`` or ``"image"`` (default ``"image"``).
                ``data["inputs"]`` holds the text (string or list of
                strings) or the image (PIL image, raw image bytes, or a URL
                string). Both keys are popped from ``data``.

        Returns:
            The embedding of the first input as a list of floats. When
            several texts are supplied, only the first one's embedding is
            returned.

        Raises:
            ValueError: If ``mode`` is neither ``"text"`` nor ``"image"``.
            requests.RequestException: If a remote image cannot be fetched.
        """
        parameters = data.pop("parameters", None) or {}
        mode = parameters.get("mode", "image")
        inputs = data.pop("inputs", data)

        if mode == "text":
            # truncation=True keeps over-long text from failing inside the
            # model, which only accepts a bounded number of tokens.
            encoded = self.tokenizer(
                inputs,
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            with torch.no_grad():
                features = self.model.get_text_features(**encoded)
        elif mode == "image":
            image = self._load_image(inputs)
            encoded = self.processor(images=image, return_tensors="pt")
            with torch.no_grad():
                features = self.model.get_image_features(**encoded)
        else:
            raise ValueError(
                f"Unsupported mode {mode!r}; expected 'text' or 'image'."
            )

        return features[0].tolist()

    def _load_image(self, source: Any) -> Image.Image:
        """Turn the request's image input into a PIL image.

        Accepts an already-decoded PIL image, raw encoded image bytes, or a
        URL string. Any other value (including an empty string) falls back
        to ``DEFAULT_IMAGE_URL``.
        """
        if isinstance(source, Image.Image):
            return source
        if isinstance(source, (bytes, bytearray)):
            return Image.open(io.BytesIO(source))

        if isinstance(source, str) and source.strip():
            # NOTE(security): the URL comes from the request; if the endpoint
            # is exposed to untrusted callers, restrict allowed hosts to
            # avoid server-side request forgery.
            url = source.strip()
        else:
            url = self.DEFAULT_IMAGE_URL

        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        # Decode from the fully downloaded body so the image does not depend
        # on the HTTP connection staying open.
        return Image.open(io.BytesIO(response.content))