import os

import litserve as ls
import torch
from dotenv import load_dotenv
from transformers import BlipForConditionalGeneration, BlipProcessor

# Load HF credentials from a local .env file; HUGGINGFACE holds the hub access token.
load_dotenv()
hf_token = os.getenv("HUGGINGFACE")


# NOTE(review): this looks like a LitServe API (setup/predict signatures match
# ls.LitAPI), but it does not subclass ls.LitAPI — confirm intended base class.
class RedionesBlipModel:
    """BLIP image-captioning model wrapper (Salesforce/blip-image-captioning-base)."""

    def __init__(self):
        self.model_name = "Salesforce/blip-image-captioning-base"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.token = hf_token

    def setup(self, device):
        """Load model and processor onto *device*.

        Args:
            device: target device from the serving framework; falls back to the
                auto-detected device from __init__ when falsy.
        """
        # Bug fix: the original discarded the passed-in device
        # (`device = self.device`); honor it when provided.
        if device:
            self.device = torch.device(device)
        # `token` replaces the deprecated `use_auth_token` parameter.
        self.model = BlipForConditionalGeneration.from_pretrained(
            self.model_name,
            token=self.token,
        )
        self.processor = BlipProcessor.from_pretrained(
            self.model_name,
            token=self.token,
        )
        # Backward-compatible alias: previous code exposed the processor
        # under the (misleading) name `tokenizer`.
        self.tokenizer = self.processor
        self.model.to(self.device)
        self.model.eval()

    def predict(self, image):
        """Generate caption token ids for *image*.

        Args:
            image: a PIL image (or anything BlipProcessor accepts as `images`).

        Returns:
            torch.Tensor of generated token ids; decode with
            `self.processor.batch_decode(outputs, skip_special_tokens=True)`.
        """
        # Bug fix: an image-only processor call yields `pixel_values`, not
        # `input_ids` — the original indexed a missing key and passed text
        # ids to an image-conditioned generate.
        inputs = self.processor(images=image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=50)
        return outputs